与等效的C实现相比,解压程序非常慢?

时间:2015-11-07 04:27:05

标签: python c performance audio compression

我开始在C中编写一个简单的音频解压缩器。但后来我在几种不同的音频容器格式中遇到了相同的音频编码,于是决定扩展这个解压缩器,把它做成一个更"通用的转换器"。因此我转向了Python,因为我对它更加熟悉,并认为从长远来看用它开发会更容易。在测试Python等价实现时,我注意到的第一件事是它与C版本相比要慢得多。

C版看起来像这样:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Fixed layout of the compressed stream: every 16-byte block holds a
 * 2-byte header (predictor/shift nibbles + flag byte) followed by
 * 14 data bytes, each carrying two 4-bit samples (28 samples total). */
uint8_t BYTES_PER_BLOCK   = 16;
uint8_t SAMPLES_PER_BLOCK = 28;
/* Bit flags carried in byte 1 of each block header.
 * FLAG_END is presumably the end-of-stream marker -- it is never tested
 * in the visible code, so confirm before relying on it. */
uint8_t FLAG_END          = 0b00000001;
uint8_t FLAG_LOOP_CONTEXT = 0b00000010;  /* checked together with FLAG_LOOP_START to record a loop point */
uint8_t FLAG_LOOP_START   = 0b00000100;

/* Prediction coefficients, indexed by the 4-bit predictor number from the
 * block header; the pair weights the two previous output samples
 * (hist1, hist2) in decompress_adpcm(). */
double coeffs[5][2] = {
    {          0.0,          0.0 },
    {  60.0 / 64.0,          0.0 },
    { 115.0 / 64.0, -52.0 / 64.0 },
    {  98.0 / 64.0, -55.0 / 64.0 },
    { 122.0 / 64.0, -60.0 / 64.0 }
};

uint32_t filesize(FILE *f)
{
    /* Report the total size of the stream in bytes while leaving the
     * current file position exactly where it was. */
    long saved_pos = ftell(f);

    fseek(f, 0, SEEK_END);
    long total = ftell(f);
    fseek(f, saved_pos, SEEK_SET);

    return (uint32_t)total;
}

int clamp_s16(int32_t val)
{
    /* Saturate to the signed 16-bit sample range [-32768, 32767]. */
    if (val < -32768)
        return -32768;

    return val > 32767 ? 32767 : val;
}

/*
 * Decode blocks_to_do 16-byte ADPCM blocks from cmpbuf and append the
 * resulting signed 16-bit samples to outfile (native endianness, two
 * bytes per fwrite).  hist1/hist2 are the two previous output samples,
 * i.e. the decoder state, so a caller can resume mid-stream.  When
 * loops > 0 and a loop-start block was seen, the section from that block
 * to the end is decoded again `loops` times via recursion, reusing the
 * final history so playback continues seamlessly.
 */
void decompress_adpcm(uint8_t *cmpbuf, FILE *outfile, uint32_t blocks_to_do, int32_t hist1, int32_t hist2, int loops)
{
    int     block_num;
    int     sample_num;

    int     predict_nr;
    int     shift_factor;
    uint8_t flag;

    int32_t loop_start = -1;   /* block index of the last loop point, -1 = none seen */
    int     l;

    short   scale;
    short   sample_byte;
    int     sample;
    int16_t outbuf[1];

    for (block_num = 0; block_num < blocks_to_do; block_num++)
    {
        /* Byte 0 of the block header: predictor number (high nibble)
         * and shift factor (low nibble); byte 1 is the flag byte. */
        predict_nr   = cmpbuf[block_num * 16 + 0] >> 4;
        shift_factor = cmpbuf[block_num * 16 + 0] & 0x0F;
        flag         = cmpbuf[block_num * 16 + 1];

        if (flag & FLAG_LOOP_START)
        {
            if (flag & FLAG_LOOP_CONTEXT)
            {
                /* Remember the most recent loop point for the replay below. */
                loop_start = block_num;
            }
        }

        for (sample_num = 0; sample_num < SAMPLES_PER_BLOCK; sample_num++)
        {
            sample = 0;

            /* NOTE(review): flag values >= 0x07 are emitted as silence;
             * presumably they mark non-audio blocks -- confirm against the
             * format spec. */
            if(flag < 0x07)
            {
                /* Two 4-bit samples per data byte: even sample_num takes the
                 * low nibble, odd takes the high nibble. */
                sample_byte = cmpbuf[(block_num * BYTES_PER_BLOCK) + 2 + (sample_num / 2)];

                /* Shift the nibble into the top of a short so the arithmetic
                 * right shift below sign-extends it. */
                scale = ((sample_num & 1 ? sample_byte >> 4 : sample_byte & 0x0F) << 12);

                /* Predicted sample = scaled nibble + weighted history. */
                sample = (int)((scale >> shift_factor) + (hist1 * coeffs[predict_nr][0]) + (hist2 * coeffs[predict_nr][1]));
            }

            /* Clamp only the emitted value; history keeps the raw sample. */
            outbuf[0] = clamp_s16(sample);
            fwrite(&outbuf[0], 2, 1, outfile);

            hist2 = hist1;
            hist1 = sample;
        }
    }

    if (loops > 0)
    {
        if (loop_start >= 0)
        {
            /* Replay the looped section; pass loops=0 to stop recursing. */
            for (l=0; l<loops; l++)
            {
                decompress_adpcm(&cmpbuf[loop_start*16], outfile, blocks_to_do - loop_start, hist1, hist2, 0);
            }
        }
    }
}

int main()
{
    /* Load the whole compressed file into memory.
     * FIX: the original never checked fopen()/fread() results (NULL
     * dereference on a missing file) and leaked both FILE handles and
     * the heap buffer. */
    FILE *cmpfile = fopen("C:\\test.adpcm", "rb");
    if (cmpfile == NULL)
    {
        perror("C:\\test.adpcm");
        return 1;
    }

    uint32_t cmpsize = filesize(cmpfile);
    uint8_t *cmpbuf = calloc(1, cmpsize);
    if (cmpbuf == NULL || fread(cmpbuf, 1, cmpsize, cmpfile) != cmpsize)
    {
        fprintf(stderr, "failed to read input\n");
        fclose(cmpfile);
        free(cmpbuf);
        return 1;
    }
    fclose(cmpfile);

    FILE *outfile = fopen("C:\\test_c.raw", "wb");
    if (outfile == NULL)
    {
        perror("C:\\test_c.raw");
        free(cmpbuf);
        return 1;
    }

    /* Decode every block, then replay the loop section three times. */
    decompress_adpcm(cmpbuf, outfile, cmpsize/16, 0, 0, 3);

    fclose(outfile);
    free(cmpbuf);
    return 0;
}

Python版本如下所示:

import struct

BYTES_PER_BLOCK   = 16   # 2 header bytes + 14 data bytes per compressed block
SAMPLES_PER_BLOCK = 28   # 14 data bytes * 2 four-bit samples each

# Bit flags carried in byte 1 of every block header.
FLAG_END          = 0b00000001
FLAG_LOOP_CONTEXT = 0b00000010
FLAG_LOOP_START   = 0b00000100

# Prediction coefficients indexed by the block header's predictor number.
# A tuple of tuples replaces the original dict-of-dicts: the indexing
# syntax is identical (coeffs[p][0]) but lookups are faster and the table
# is immutable.
coeffs = (
    (          0.0,          0.0),
    ( 60.0 / 64.0,           0.0),
    (115.0 / 64.0, -52.0 / 64.0),
    ( 98.0 / 64.0, -55.0 / 64.0),
    (122.0 / 64.0, -60.0 / 64.0),
)

# Precompiled little-endian signed-16-bit packer (avoids re-parsing "<h"
# on every sample).
s16_t = struct.Struct("<h")

def s32(n):
    """Wrap *n* (int or float) into the signed 32-bit range.

    The modulo runs before truncation so that float inputs behave exactly
    like the original implementation.
    """
    return int((n + 0x80000000) % 0x100000000 - 0x80000000)

def s16(n):
    """Wrap *n* into the signed 16-bit range [-32768, 32767]."""
    return int((n + 0x8000) % 0x10000 - 0x8000)

def put_s16_le(n):
    """Serialize *n* as two little-endian bytes (signed 16-bit).

    Delegates to the module-level precompiled Struct so the format string
    is parsed only once.
    """
    return s16_t.pack(n)

def clamp_s16(n):
    """Saturate *n* to the signed 16-bit sample range."""
    if n < -32768:
        return -32768
    return 32767 if n > 32767 else n

def decompress_adpcm(cmpbuf, outfile, blocks_to_do, hist1=0, hist2=0, loops=0):
    """Decode *blocks_to_do* 16-byte ADPCM blocks from *cmpbuf* and write
    raw little-endian signed 16-bit samples to *outfile*.

    ``hist1``/``hist2`` carry the two previous (unclamped) output samples
    -- the decoder state -- so recursive loop replays continue seamlessly.
    When ``loops`` > 0, the section starting at the last block flagged
    FLAG_LOOP_START|FLAG_LOOP_CONTEXT is decoded again *loops* times,
    mirroring the C implementation.
    """
    loop_start = -1

    # Samples are accumulated here and written with one call per
    # invocation instead of one write per sample (the profile showed
    # 274624 ``write`` calls costing over a second).
    out = bytearray()
    pack = s16_t.pack  # hoist the bound method out of the hot loop

    for block_num in range(blocks_to_do):
        base = block_num * BYTES_PER_BLOCK

        # Header byte 0: predictor number (high nibble), shift factor
        # (low nibble).  Byte 1: flag byte.
        header = cmpbuf[base]
        predict_nr = header >> 4
        shift_factor = header & 0x0F
        flag = cmpbuf[base + 1]

        if flag & FLAG_LOOP_START and flag & FLAG_LOOP_CONTEXT:
            loop_start = block_num

        # Per-block invariants hoisted out of the sample loop.
        c0, c1 = coeffs[predict_nr]
        decode = flag < 0x07

        for sample_num in range(SAMPLES_PER_BLOCK):
            sample = 0

            if decode:
                # Two 4-bit samples per data byte: even index = low
                # nibble, odd index = high nibble.
                adpcm_byte = cmpbuf[base + 2 + (sample_num >> 1)]

                if sample_num & 1:
                    scale = adpcm_byte >> 4
                else:
                    scale = adpcm_byte & 0x0F

                # Move the nibble to the top of a 16-bit value so the
                # right shift below sign-extends it, like C's short.
                scale = s16(scale << 12)

                sample = s32((scale >> shift_factor) + (hist1 * c0) + (hist2 * c1))

            out += pack(clamp_s16(sample))

            # History keeps the raw (unclamped) sample, matching the C code.
            hist2 = hist1
            hist1 = sample

    outfile.write(bytes(out))

    if loops > 0 and loop_start >= 0:
        for _ in range(loops):
            # BUG FIX: the original call dropped the blocks_to_do argument
            # (hist1 was silently consumed as blocks_to_do) and sliced by
            # block index instead of byte offset.  This mirrors the C
            # version: decode from loop_start to the end, loops times.
            decompress_adpcm(
                cmpbuf[loop_start * BYTES_PER_BLOCK:blocks_to_do * BYTES_PER_BLOCK],
                outfile,
                blocks_to_do - loop_start,
                hist1,
                hist2,
                0,
            )

def main():
    """Read the compressed file, decode it, and write the raw s16 output."""
    with open(r"C:\test.adpcm", "rb") as cmpf:
        cmpbuf = cmpf.read()

    with open(r"C:\test_py.raw", "wb") as out:
        # BUG FIX: the original passed the undefined name ``outf`` here,
        # which raises NameError at runtime; the handle is bound to ``out``.
        decompress_adpcm(cmpbuf, out, len(cmpbuf) // BYTES_PER_BLOCK, loops=3)

    return 0


if __name__ == "__main__":
    main()

这是我通过profile运行得到的结果:

         1647764 function calls (1647761 primitive calls) in 8.219 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    8.219    8.219 :0(exec)
        8    0.000    0.000    0.000    0.000 :0(len)
        2    0.000    0.000    0.000    0.000 :0(open)
   274624    0.344    0.000    0.344    0.000 :0(pack)
        1    0.000    0.000    0.000    0.000 :0(read)
        1    0.000    0.000    0.000    0.000 :0(setprofile)
   274624    1.234    0.000    1.234    0.000 :0(write)
        1    0.000    0.000    8.219    8.219 <string>:1(<module>)
   274624    0.625    0.000    0.625    0.000 test.py:105(s32)
   274624    0.734    0.000    0.734    0.000 test.py:108(s16)
   274624    0.875    0.000    1.219    0.000 test.py:111(put_s16_le)
   274624    0.266    0.000    0.266    0.000 test.py:114(clamp_s16)
      4/1    4.141    1.035    8.219    8.219 test.py:123(decompress_adpcm)
        1    0.000    0.000    8.219    8.219 test.py:178(main)
        1    0.000    0.000    8.219    8.219 profile:0(main())
        0    0.000             0.000          profile:0(profiler)

在我的机器上(Intel Core 2 Duo E8200 @ 2.67Ghz),每次测试运行时,C版本都需要不到一秒的时间才能完成执行,而Python版本需要大约8秒(如上所示)完成。我使用相同的音频文件测试两个版本,并且没有资源占用或后台的任何我知道的可能会以某种方式影响Python的性能。

现在,我经常看到有人建议"如果你想要速度,请使用C",我当然同意。但是,即使是写得最好的Python,也不应该比C慢这么多吧?!我一直在尽力优化它,但没有看到任何重大改进。我做的最后一个调整是为put_s16_le使用预编译的静态Struct,它确实有所帮助,但幅度仍然不大。

那么有没有办法优化Python版本,或者我在这里遇到一个缓慢的脚本?

如果重要,我使用的是Python 3.4.3。

1 个答案:

答案 0 :(得分:2)

哇,这是一个令人印象深刻的糟糕结果,慢八倍!

你必须这样想:你在Python中执行的每一个算术/位运算,其本身的计算量和C中相同;但在此之外,Python运行时还要判断参与运算的是哪些对象、查找这些对象实现了哪个运算符、调用解释器底层的PyObject机制,再分配一个新对象来保存结果并完成赋值。

许多级别的间接必然会引入开销。所以,我真的很惊讶python实现只慢了八倍。这可能意味着你的C实现还有改进的余地。