Question

我有一个计算大量数据并将其写入文件的程序。我的数据只是一堆0-16的数字（17个不同的值），我计算了每个数字出现在数据中的频率。我需要使用尽可能少的磁盘空间和尽可能少的RAM，所以我在纯python中编写了一个小的霍夫曼编码/解码模块，它一次用内存中的编码符号来编写/读取压缩数据。是否有一个python附带的模块可以做类似的事情？下面是代码，其中包含一些如何使用它的示例（警告代码有点长篇评论）：

def makeTree(data):
    """data is a list of tuples, whos first entry is a priority/frequency
    number, and whos second entry is tuple containing the data to
    be encoded. The tree uses an internal tag system to tell where the
    branch ends. (End nodes are terminated with a False)"""
    def tag(data):
        taggedData = []
        for priority, datum in data:
            #all of the initial data is an end branch
            taggedData += [(priority, (datum, False))]
        return taggedData
    #get the tagged data into decending order of priority/frequency
    tree = sorted(tag(data), reverse=True)
    while len(tree)>1:
        #get the two lowest priority branches
        bottomR, bottomL = tree.pop(), tree.pop()
        #and stick them together into a new branch with combined priority
        new_elem = bottomR[0]+bottomL[0], ((bottomL, bottomR), True)
        #then add them back to the list of branches and sort
        tree += [new_elem]
        tree.sort(reverse=True)
    return tree[0]

def makeTable(tree, code=""):
    """Takes a tree such as generated by makeTree and returns a dictionary
    of code:value pairs."""
    #if this is an end branch, return code:value pair
    if tree[1][1]==False:
        return {code:tree[1][0]}
    #otherwise find the tables for the left and right branches
    #add them to the main table, and return
    table = {}
    table.update(makeTable(tree[1][0][0], code+'0')) #left
    table.update(makeTable(tree[1][0][1], code+'1')) #right
    return table

class Decoder:
    """this class creates a Decoder object which is used to decode a compressed
    file using the appropriate decoding table (duh). It used to be a function,
    but it was buggy and would also be ugly if I left it a function. (this
    class was written After the Encdoer class.)
    """
    def __init__(self, fname, table):
        self.file = open(fname)
        self.table = table
        self.byte = None
        self.bit = 7
        self.newByte = True

    def decode(self, size=1):
        """Decodes and yields size number of symbols from the file.
        Size defaults to 1"""
        #a counter for how many symbols were read
        read = 0
        code = ''
        while read<size:
            if self.newByte:
                self.byte = ord(self.file.read(1))
            for n in xrange(self.bit, -1, -1):
                code += str((self.byte & 1<<n) >> n)
                self.byte &= (1<<n)-1
                if code in self.table:
                    yield self.table[code]
                    read += 1
                    code = ''
                    if read==size:
                        self.bit = n-1
                        self.newByte = False
                        raise StopIteration
            self.bit = 7
            self.newByte = True

    def close(self):
        self.file.close()

class Encoder:
    """This class creates an encoder object, which is used to write encoded data
    to a file. It was initially going to be a function, but I couldn't
    accomplish that without code getting really ugly. :p """
    def __init__(self, fname, table):
        self.file = open(fname, 'w')
        self.table = table
        self.code = ''

    def encode(self, datum):
        """Attempts to write encoded datum to file. If their isn't enough
        code to write a whole number amount of bytes, then the code is saved up
        until there is."""
        self.code += self.table[datum]
        if len(self.code)%8==0:
            self.__write_code_chunk()
        return

    def end_encode(self):
        """Writes any remaining code to the file, appending the code with
        trailing zeros to fit within a byte, then closes the file."""
        #if the length of the code remaining isn't a multiple of 8 bits
        if len(self.code)%8:
            #then add zeros to the end so that it is
            self.code += "0"*(8 - len(self.code)%8)
        self.__write_code_chunk()
        self.file.close()
        return

    def __write_code_chunk(self):
        bytes = len(self.code)/8
        #for every byte (or every 8 bits)...
        for _ in xrange(bytes):
            #turn those bits into an number using int with base 2,
            #then turn the number into an ascii character,
            #and finally write the data to the file.
            self.file.write(chr(int(self.code[:8], 2)))
            #then get rid of the 8 bits just read
            self.code = self.code[8:]
        #make sure there is no code left over
        assert self.code==''
        return

if __name__=="__main__":
    import random

    mandelbrotData = [
        (0.10776733333333334, 0),
        (0.24859, 1),
        (0.12407666666666667, 2),
        (0.07718866666666667, 3),
        (0.04594733333333333, 4),
        (0.03356, 5),
        (0.023286666666666664, 6),
        (0.018338, 7),
        (0.014030666666666667, 8),
        (0.011918, 9),
        (0.009500666666666668, 10),
        (0.008396666666666667, 11),
        (0.006936, 12),
        (0.006365999999999999, 13),
        (0.005466, 14),
        (0.0048920000000000005, 15),
        (0.2537393333333333, 16)]
    decode_table = makeTable(makeTree(mandelbrotData))
    encode_table = {val:key for key, val in decode_table.iteritems()}
    approx_data = sum([[val]*int(round(freq*10**3/2)) for freq, val in mandelbrotData], [])
    random.shuffle(approx_data)

    testname = 'hufftest'
    encoder = Encoder(testname, encode_table)

    for val in approx_data:
        encoder.encode(val)
    encoder.end_encode()

    decoder = Decoder(testname, decode_table)
    decoded = list(decoder.decode(len(approx_data)/2))
    decoded += list(decoder.decode(len(approx_data)/2))
    print approx_data == decoded

是否有可以更快地完成类似操作的模块？如果没有，我有没有办法改变我的代码，使其更快？

Answer 1

在内存中，相同值的不同出现只会占用一个位置。但是，重复对它们的引用。
对于磁盘存储，我可能只是使用zlib压缩它。

Answer 2

如果您的数据显着重复，那么您可能希望尝试对其进行行程编码，这可能是一个相对较快的操作。这是一个作为生成器的实现，可以帮助最小化其总体内存使用。请注意，当运行非常短时，它只输出值而不是（重复计数，值）元组，以避免膨胀输出并可能使其比原来更长。

from itertools import groupby

def run_length_encode(data):
    for value, igroup in groupby(data):
        repeat_count = len(list(igroup))
        yield value if repeat_count == 1 else repeat_count, value

if __name__ == '__main__':
    """ generate some random data with repeats and encode it """
    import random

    DATA_SIZE = 20
    MAX_VAL = 16
    MAX_REPEAT = 5
    data = []
    while len(data) < DATA_SIZE:
        val = random.randint(0, MAX_VAL)
        repeat = min(DATA_SIZE-len(data), random.randint(0, MAX_REPEAT))
        for _ in xrange(repeat): data.append(val)

    print data
    print [item for item in run_length_encode(data)]

输出：

[5, 5, 5, 9, 8, 8, 7, 5, 5, 5, 5, 5, 1, 7, 9, 16, 16, 16, 16, 16]
[(3, 5), 9, (2, 8), 7, (5, 5), 1, 7, 9, (5, 16)]

如果运行时间非常长，那么最好明确地计算每组中迭代的数量，而不是将它们变成列表并占用它的长度：

def ilen(iterable):
    """ return the number of items in an iterable """
    return sum(1 for _ in iterable)

def run_length_encode(data):
    for value, igroup in groupby(data):
#        repeat_count = len(list(igroup))
        repeat_count = ilen(igroup)
        yield value if repeat_count == 1 else repeat_count, value

由于数据值的范围相对较小，您可以（也）将它们编码为单字节字符值。

什么是更快速的压缩数据的方法？

2 个答案: