我最初用 C 编写了一个简单的音频解压缩器。但后来我在几种不同的音频容器格式中遇到了相同的音频编码,于是决定扩展这个解压缩器,把它做成一个更"通用"的转换器,所以我转向了 Python,因为我对它更加熟悉,并认为从长远来看用它会更容易。在测试 Python 版本时,我注意到的第一件事是它比 C 版本慢得多。
C版看起来像这样:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
/* Size in bytes of one compressed block (2 header bytes + 14 data bytes). */
uint8_t BYTES_PER_BLOCK = 16;
/* Number of samples decoded from one block (two 4-bit samples per data byte). */
uint8_t SAMPLES_PER_BLOCK = 28;

/*
 * Bit masks for the flag byte (second byte of every block).
 * Written as hex literals: the `0b...` binary form is only standard from C23
 * on and is a compiler extension before that.
 */
uint8_t FLAG_END = 0x01;          /* presumably marks the final block; not referenced by the decoder in this file */
uint8_t FLAG_LOOP_CONTEXT = 0x02; /* must be set together with FLAG_LOOP_START to record a loop point */
uint8_t FLAG_LOOP_START = 0x04;

/*
 * Prediction filter table, indexed by the high nibble of a block's first
 * byte. Column 0 weights the previous sample, column 1 the one before it.
 */
double coeffs[5][2] = {
    { 0.0, 0.0 },
    { 60.0 / 64.0, 0.0 },
    { 115.0 / 64.0, -52.0 / 64.0 },
    { 98.0 / 64.0, -55.0 / 64.0 },
    { 122.0 / 64.0, -60.0 / 64.0 }
};
/*
 * Return the total size of the open stream `f` in bytes, leaving the
 * stream's current position where it was.
 *
 * Returns 0 when `f` is NULL or when the position/size cannot be
 * determined (ftell/fseek failure). Sizes that do not fit in 32 bits are
 * truncated by the return type.
 */
uint32_t filesize(FILE *f)
{
    long saved_pos;
    long end_pos;

    if (f == NULL)
        return 0;

    /* ftell() returns -1L on failure; keep it in a signed long so that an
     * error is not mistaken for a huge valid offset. */
    saved_pos = ftell(f);
    if (saved_pos < 0)
        return 0;

    if (fseek(f, 0, SEEK_END) != 0)
        return 0;
    end_pos = ftell(f);

    /* Always try to restore the caller's position, even if the size query failed. */
    fseek(f, saved_pos, SEEK_SET);

    if (end_pos < 0)
        return 0;
    return (uint32_t)end_pos;
}
/*
 * Saturate `val` to the signed 16-bit range [-32768, 32767].
 * Values already inside the range are returned unchanged.
 */
int clamp_s16(int32_t val)
{
    const int32_t upper = 32767;
    const int32_t lower = -32768;

    return (val > upper) ? upper
         : (val < lower) ? lower
         : val;
}
/*
 * Decode `blocks_to_do` consecutive ADPCM blocks from `cmpbuf` and write
 * every decoded sample to `outfile` as a raw 16-bit value in host byte order.
 *
 * Block layout (BYTES_PER_BLOCK bytes each):
 *   byte 0      high nibble = index into `coeffs`, low nibble = shift amount
 *   byte 1      flag byte
 *   bytes 2..   two 4-bit samples per byte, low nibble first
 *
 * cmpbuf        start of the first block to decode
 * outfile       stream opened for binary writing
 * blocks_to_do  number of blocks available at `cmpbuf`
 * hist1, hist2  the previous sample and the one before it (0 for a fresh stream)
 * loops         how many extra times to re-decode the section from the last
 *               block flagged FLAG_LOOP_START|FLAG_LOOP_CONTEXT to the end;
 *               ignored when no such block was seen
 *
 * Write errors on `outfile` are not reported.
 */
void decompress_adpcm(uint8_t *cmpbuf, FILE *outfile, uint32_t blocks_to_do, int32_t hist1, int32_t hist2, int loops)
{
    const unsigned num_filters = (unsigned)(sizeof(coeffs) / sizeof(coeffs[0]));
    uint32_t block_num;
    int sample_num;
    int32_t loop_start = -1;
    int l;

    for (block_num = 0; block_num < blocks_to_do; block_num++)
    {
        const uint8_t *block = &cmpbuf[(size_t)block_num * BYTES_PER_BLOCK];
        unsigned predict_nr = block[0] >> 4;
        int shift_factor = block[0] & 0x0F;
        uint8_t flag = block[1];

        /* The nibble can encode 0..15 but the table is smaller; indexing past
         * it would read out of bounds, so unknown filters fall back to row 0
         * (no prediction). */
        if (predict_nr >= num_filters)
            predict_nr = 0;

        /* A loop point is only recorded when both flag bits are set. */
        if ((flag & FLAG_LOOP_START) && (flag & FLAG_LOOP_CONTEXT))
            loop_start = (int32_t)block_num;

        for (sample_num = 0; sample_num < SAMPLES_PER_BLOCK; sample_num++)
        {
            int32_t sample = 0;
            int16_t out;

            /* Blocks whose flag byte is 0x07 or above carry no sample data
             * and are emitted as silence. */
            if (flag < 0x07)
            {
                uint8_t packed = block[2 + (sample_num / 2)];
                int nibble = (sample_num & 1) ? (packed >> 4) : (packed & 0x0F);
                int scale;

                /* Sign-extend the 4-bit value explicitly, then place it in the
                 * top nibble of a 16-bit quantity (same value the original got
                 * by narrowing `nibble << 12` to a short). */
                if (nibble & 0x08)
                    nibble -= 16;
                scale = nibble * 4096;

                sample = (int)((scale >> shift_factor)
                               + (hist1 * coeffs[predict_nr][0])
                               + (hist2 * coeffs[predict_nr][1]));
            }

            out = (int16_t)clamp_s16(sample);
            fwrite(&out, sizeof(out), 1, outfile);

            /* The history keeps the unclamped sample. */
            hist2 = hist1;
            hist1 = sample;
        }
    }

    if (loops > 0 && loop_start >= 0)
    {
        /* Every repetition starts from the same end-of-stream history and is
         * decoded with loops = 0 so the recursion stops after one level. */
        for (l = 0; l < loops; l++)
        {
            decompress_adpcm(&cmpbuf[(size_t)loop_start * BYTES_PER_BLOCK], outfile,
                             blocks_to_do - (uint32_t)loop_start, hist1, hist2, 0);
        }
    }
}
int main()
{
FILE *cmpfile = fopen("C:\\test.adpcm", "rb");
uint32_t cmpsize = filesize(cmpfile);
uint8_t *cmpbuf = calloc(1, cmpsize);
fread(cmpbuf, cmpsize, 1, cmpfile);
FILE *outfile = fopen("C:\\test_c.raw", "wb");
decompress_adpcm(cmpbuf, outfile, cmpsize/16, 0, 0, 3);
return 0;
}
Python版本如下所示:
import struct
# Layout of one compressed block: 16 bytes that decode to 28 samples.
BYTES_PER_BLOCK = 16
SAMPLES_PER_BLOCK = 28

# Bit masks tested against the flag byte (second byte of each block).
FLAG_END = 1 << 0
FLAG_LOOP_CONTEXT = 1 << 1
FLAG_LOOP_START = 1 << 2

# Prediction filters keyed by filter index; inner key 0 weights the previous
# sample and inner key 1 the sample before that. Numerators are in 64ths.
coeffs = {
    index: {0: first / 64.0, 1: second / 64.0}
    for index, (first, second) in enumerate(
        [
            (0.0, 0.0),
            (60.0, 0.0),
            (115.0, -52.0),
            (98.0, -55.0),
            (122.0, -60.0),
        ]
    )
}

# Pre-compiled packer for one little-endian signed 16-bit integer.
s16_t = struct.Struct("<h")
def s32(n):
    """Wrap *n* into the signed 32-bit range and return the result as an int.

    Fractional input is truncated toward zero after wrapping.
    """
    half_span = 0x80000000
    wrapped = (n + half_span) % (2 * half_span)
    return int(wrapped - half_span)
def s16(n):
    """Wrap *n* into the signed 16-bit range and return the result as an int."""
    half_span = 1 << 15
    full_span = 1 << 16
    shifted = (n + half_span) % full_span
    return int(shifted - half_span)
def put_s16_le(n):
    """Return *n* encoded as two bytes: a little-endian signed 16-bit integer."""
    encoded = s16_t.pack(n)
    return encoded
def clamp_s16(n):
    """Saturate *n* to the signed 16-bit range [-32768, 32767].

    A value already inside the range is returned unchanged.
    """
    return 32767 if n > 32767 else (-32768 if n < -32768 else n)
def decompress_adpcm(cmpbuf, outfile, blocks_to_do, hist1=0, hist2=0, loops=0):
    """Decode ADPCM blocks from *cmpbuf* and write 16-bit samples to *outfile*.

    Each block is ``BYTES_PER_BLOCK`` bytes: byte 0 holds the filter index
    (high nibble) and the shift amount (low nibble), byte 1 is the flag byte,
    and the remaining bytes hold two 4-bit samples each, low nibble first.
    Every block yields ``SAMPLES_PER_BLOCK`` little-endian signed 16-bit
    samples; blocks whose flag byte is 0x07 or above are written as silence.

    Args:
        cmpbuf: Bytes-like object containing the compressed blocks.
        outfile: Binary file-like object; only ``write`` is used.
        blocks_to_do: Number of blocks to decode from the start of *cmpbuf*.
        hist1: Previous decoded sample (0 for a fresh stream).
        hist2: Decoded sample before *hist1*.
        loops: How many extra times to decode the section from the last block
            flagged ``FLAG_LOOP_START | FLAG_LOOP_CONTEXT`` to the end.
            Ignored when no such block was seen.

    Raises:
        KeyError: If a block that carries sample data names a filter index
            that is not present in ``coeffs``.
        IndexError: If *cmpbuf* is shorter than *blocks_to_do* blocks.
    """
    loop_start = -1
    for block_num in range(blocks_to_do):
        base = block_num * BYTES_PER_BLOCK
        header = cmpbuf[base]
        predict_nr = header >> 4
        shift_factor = header & 0x0F
        flag = cmpbuf[base + 1]

        # A loop point is only recorded when both flag bits are set.
        if flag & FLAG_LOOP_START and flag & FLAG_LOOP_CONTEXT:
            loop_start = block_num

        # The filter is constant for the whole block, so look it up once.
        # It is only looked up for blocks that actually carry sample data.
        has_samples = flag < 0x07
        if has_samples:
            block_filter = coeffs[predict_nr]
            coef1 = block_filter[0]
            coef2 = block_filter[1]

        for sample_num in range(SAMPLES_PER_BLOCK):
            sample = 0
            if has_samples:
                adpcm_byte = cmpbuf[base + 2 + (sample_num // 2)]
                if sample_num & 1:
                    nibble = adpcm_byte >> 4
                else:
                    nibble = adpcm_byte & 0x0F
                # Move the nibble to the top of a 16-bit value and sign-extend.
                scale = s16(nibble << 12)
                sample = s32((scale >> shift_factor) + (hist1 * coef1) + (hist2 * coef2))
            outfile.write(put_s16_le(clamp_s16(sample)))
            # The history keeps the unclamped sample.
            hist2 = hist1
            hist1 = sample

    if loops > 0 and loop_start >= 0:
        # loop_start is a block index: convert it to a byte offset before
        # slicing, and pass the block count explicitly so hist1/hist2 land in
        # the right parameters.
        loop_blocks = blocks_to_do - loop_start
        loop_offset = loop_start * BYTES_PER_BLOCK
        loop_data = cmpbuf[loop_offset:loop_offset + (loop_blocks * BYTES_PER_BLOCK)]
        for _ in range(loops):
            decompress_adpcm(loop_data, outfile, loop_blocks, hist1, hist2)
def main():
    """Decode the hard-coded input file and write the raw samples next to it.

    The whole input is read into memory, decoded with the looped section
    repeated three times, and written to the hard-coded output file.

    Returns:
        0 once decoding has finished.
    """
    with open(r"C:\test.adpcm", "rb") as cmpf:
        cmpbuf = cmpf.read()
    # The handle must be bound to the same name that is passed to the decoder.
    with open(r"C:\test_py.raw", "wb") as outf:
        decompress_adpcm(cmpbuf, outf, len(cmpbuf) // BYTES_PER_BLOCK, loops=3)
    return 0


if __name__ == "__main__":
    main()
这是我通过 profile 运行得到的结果:
1647764 function calls (1647761 primitive calls) in 8.219 seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 8.219 8.219 :0(exec)
8 0.000 0.000 0.000 0.000 :0(len)
2 0.000 0.000 0.000 0.000 :0(open)
274624 0.344 0.000 0.344 0.000 :0(pack)
1 0.000 0.000 0.000 0.000 :0(read)
1 0.000 0.000 0.000 0.000 :0(setprofile)
274624 1.234 0.000 1.234 0.000 :0(write)
1 0.000 0.000 8.219 8.219 <string>:1(<module>)
274624 0.625 0.000 0.625 0.000 test.py:105(s32)
274624 0.734 0.000 0.734 0.000 test.py:108(s16)
274624 0.875 0.000 1.219 0.000 test.py:111(put_s16_le)
274624 0.266 0.000 0.266 0.000 test.py:114(clamp_s16)
4/1 4.141 1.035 8.219 8.219 test.py:123(decompress_adpcm)
1 0.000 0.000 8.219 8.219 test.py:178(main)
1 0.000 0.000 8.219 8.219 profile:0(main())
0 0.000 0.000 profile:0(profiler)
在我的机器上(Intel Core 2 Duo E8200 @ 2.67Ghz),每次测试运行时,C版本都需要不到一秒的时间才能完成执行,而Python版本需要大约8秒(如上所示)完成。我使用相同的音频文件测试两个版本,并且没有资源占用或后台的任何我知道的可能会以某种方式影响Python的性能。
现在,我经常看到有人建议"如果你想要速度,请使用 C",我当然同意这一点;但是,即使是写得最好的 Python,也不应该比 C 慢这么多吧?!我一直在尽力优化它,但没有看到任何显著的改进。我做的最后一个调整是为 put_s16_le 添加一个静态的 Struct 对象,它确实有所帮助,但帮助仍然不大。
那么,有没有办法优化 Python 版本,还是说我只能接受一个运行缓慢的脚本?
如果重要,我使用的是Python 3.4.3。
答案 0 :(得分:2)
你必须这样想:对于你在 Python 中执行的每一个算术/按位操作,都要做与 C 中相同的计算;但除此之外,Python 运行时还必须先确定参与运算的对象是什么类型、它们实现了哪个运算符,然后调用底层解释器中对应的实现,生成一个新的对象来保存结果,最后再把它赋值给变量。
许多级别的间接必然会引入开销。所以,我真的很惊讶python实现只慢了八倍。这可能意味着你的C实现还有改进的余地。