我正在尝试转换YCbCr-file 从8 bpp到10 bpp。
到目前为止,我最好的方法仍然比最基本的朴素C实现慢一个数量级。
朴素的C实现大约8秒跑完;希望能把Python代码的运行时间减少到1秒以内。
我很想知道可以获得什么样的表现 从标准python处理二进制文件。示例文件是 在CIF-resolution中,与1080p中的内容相比“小”。 尽管我主要感兴趣,也可以随意添加numpy-advice 在标准的python中。
测试文件可以从 http://trace.eas.asu.edu/yuv/foreman/foreman_cif.7z 下载。
正确的10位输出的 sha1sum 是:
c511dabc793383f7fd0ed69b4bb9b9f89ef73b84
Python:
#!/usr/bin/env python
import array
f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'
def bytesfromfile(f):
    """Yield the contents of binary file *f* as array('B') chunks.

    Reads up to 8 KiB at a time and stops on the first empty read
    (end of file).
    """
    while True:
        raw = array.array('B')
        # fromstring() was deprecated and removed in Python 3.9;
        # frombytes() is the same operation under its modern name.
        raw.frombytes(f.read(8192))
        if not raw:
            break
        yield raw
with open(f_in, 'rb') as fd_in, \
        open(f_out, 'wb') as fd_out:
    # Widen every 8-bit sample to 10 bits and emit it as a little-endian
    # 16-bit value (low byte first), matching the C reference program.
    for chunk in bytesfromfile(fd_in):
        out = bytearray()
        for b in chunk:
            widened = b << 2                   # scale 8 bpp -> 10 bpp
            out.append(widened & 0xff)         # low byte
            out.append((widened >> 8) & 0xff)  # high byte: the top 2 bits
        # Write the bytearray directly; array(...).tostring() was removed
        # in Python 3.9 (tobytes() is the modern spelling).
        fd_out.write(out)
朴素的C实现:
#include <stdio.h>
#include <stdlib.h>
/*
 * Widen each 8-bit sample of foreman_cif.yuv to 10 bits and write it to
 * c_10bpp.yuv as a little-endian 16-bit value (low byte first).
 */
int main(int argc, char** argv)
{
    int c;
    FILE* fd_in;
    FILE* fd_out;

    (void)argc;
    (void)argv;

    /* BUG FIX: the original never checked fopen() for failure. */
    fd_in = fopen("foreman_cif.yuv", "rb");
    if (fd_in == NULL) {
        perror("foreman_cif.yuv");
        return EXIT_FAILURE;
    }
    fd_out = fopen("c_10bpp.yuv", "wb");
    if (fd_out == NULL) {
        perror("c_10bpp.yuv");
        fclose(fd_in);
        return EXIT_FAILURE;
    }

    while ((c = fgetc(fd_in)) != EOF) {
        c <<= 2;                        /* 8 bpp -> 10 bpp */
        /*
         * BUG FIX: fwrite(&d[0], 1, 1, ...) wrote the first byte of an
         * int, which is the MSB on big-endian machines.  fputc() writes
         * the intended value portably.
         */
        fputc(c & 0xff, fd_out);        /* low byte */
        fputc((c >> 8) & 0xff, fd_out); /* high byte: top 2 bits */
    }

    fclose(fd_in);
    fclose(fd_out);
    return EXIT_SUCCESS;
}
答案 0(得分:4):
问题中的代码在我的计算机上需要 25 秒。
- numpy:0.37 秒
import numpy as np

# Map both files straight into memory; no explicit read/write loop needed.
a_in = np.memmap('foreman_cif.yuv', mode='readonly')
n = len(a_in)
a_out = np.memmap('py_10bpp.yuv', mode='write', shape=2 * n)

# Each 8-bit sample becomes a little-endian 16-bit value: low byte first,
# then the two bits shifted out on top.  uint8 arithmetic wraps modulo 256,
# so `<< 2` already behaves like `(b << 2) & 0xff`.
a_out[0::2] = a_in << 2
a_out[1::2] = a_in >> 6
- cython:0.20 秒
from functools import partial
import pyximport; pyximport.install()  # pip install cython
from bpp8to10 import convert  # bpp8to10.pyx

f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'


def main():
    """Convert f_in (8 bpp) to f_out (10 bpp) 8 KiB at a time."""
    with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
        chunk = fd_in.read(8192)
        while chunk:
            fd_out.write(convert(chunk))
            chunk = fd_in.read(8192)


main()
:
bpp8to10.pyx
(纯CPython版本的主要加速是将代码从模块级别移动到 main() 函数中。)
from cpython.bytes cimport PyBytes_FromStringAndSize
def convert(bytes chunk not None):
    # Expand each 8-bit sample of `chunk` into a little-endian 10-bit pair:
    # low byte = (c << 2) & 0xff, high byte = c >> 6 (the top two bits).
    cdef:
        # Allocate the (uninitialised) output bytes object in one shot,
        # exactly twice the input length.
        bytes data = PyBytes_FromStringAndSize(NULL, len(chunk)*2)
        char* buf = data # no copy
        Py_ssize_t j = 0
        unsigned char c
    # Fill the buffer in place through the raw char pointer; `c` is typed
    # unsigned char, so the loop runs at C speed.
    for c in chunk:
        buf[j] = (c << 2)      # low byte (assignment truncates to 8 bits)
        buf[j + 1] = (c >> 6)  # high byte: top 2 bits
        j += 2
    return data
- main() + multiprocessing:6.7 秒(2 个 CPU)
from functools import partial
from multiprocessing import Pool
f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'
def convert(chunk):
    """Expand each 8-bit sample in *chunk* to a little-endian 10-bit pair.

    For a sample ``b`` the output is ``(b << 2) & 0xff`` (low byte)
    followed by ``b >> 6`` (the top two bits), matching the C reference
    implementation and the numpy version.
    """
    data = bytearray()  # [] -> bytearray(): 17 -> 15 seconds
    data_append = data.append  # 15 -> 12 seconds
    for b in bytearray(chunk):  # on Python 3: `for b in chunk:`
        data_append((b << 2) & 0xff)
        # BUG FIX: was (b >> 8) & 0xff, which is always 0 for an 8-bit
        # sample; the high byte must carry the two bits shifted out by << 2,
        # i.e. b >> 6.
        data_append((b >> 6) & 0xff)
    return data
def main():  # put in main(): 25 -> 17 seconds
    """Convert f_in to f_out, farming 8 KiB chunks out to worker processes."""
    pool = Pool(processes=2)  # 12 -> 6.7 seconds
    try:
        with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
            # imap() preserves chunk order, so output bytes stay in sequence.
            for data in pool.imap(convert, iter(partial(fd_in.read, 8192), b'')):
                fd_out.write(data)
    finally:
        # BUG FIX: the pool was never shut down; close() + join() releases
        # the worker processes deterministically instead of leaking them.
        pool.close()
        pool.join()

main()
- pypy:1.6 秒