我正在尝试转换YCbCr-file 从8 bpp到10 bpp。
到目前为止,我最好的方法仍然比最基本的朴素C实现慢一个数量级。
朴素的C实现大约8秒跑完;希望能把Python代码的运行时间减少到1秒以内。
我很想知道可以获得什么样的表现 从标准python处理二进制文件。示例文件是 在CIF-resolution中,与1080p中的内容相比“小”。 尽管我主要感兴趣,也可以随意添加numpy-advice 在标准的python中。
测试文件可以从 http://trace.eas.asu.edu/yuv/foreman/foreman_cif.7z 下载。
正确的10位输出的 sha1sum 是:
c511dabc793383f7fd0ed69b4bb9b9f89ef73b84
Python:
#!/usr/bin/env python
import array
f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'
def bytesfromfile(f):
    """Yield the contents of binary file *f* as array('B') chunks.

    Reads up to 8 KiB at a time and stops on the first empty read
    (end of file).
    """
    while True:
        raw = array.array('B')
        # fromstring() was deprecated and removed in Python 3.9;
        # frombytes() is the same operation under its modern name.
        raw.frombytes(f.read(8192))
        if not raw:
            break
        yield raw
with open(f_in, 'rb') as fd_in, \
        open(f_out, 'wb') as fd_out:
    # Widen every 8-bit sample to 10 bits and emit it as a little-endian
    # 16-bit value (low byte first), matching the C reference program.
    for chunk in bytesfromfile(fd_in):
        out = bytearray()
        for b in chunk:
            widened = b << 2                   # scale 8 bpp -> 10 bpp
            out.append(widened & 0xff)         # low byte
            out.append((widened >> 8) & 0xff)  # high byte: the top 2 bits
        # Write the bytearray directly; array(...).tostring() was removed
        # in Python 3.9 (tobytes() is the modern spelling).
        fd_out.write(out)
朴素的C实现:
#include <stdio.h>
#include <stdlib.h>
/*
 * Widen each 8-bit sample of foreman_cif.yuv to 10 bits and write it to
 * c_10bpp.yuv as a little-endian 16-bit value (low byte first).
 */
int main(int argc, char** argv)
{
    int c;
    FILE* fd_in;
    FILE* fd_out;

    (void)argc;
    (void)argv;

    /* BUG FIX: the original never checked fopen() for failure. */
    fd_in = fopen("foreman_cif.yuv", "rb");
    if (fd_in == NULL) {
        perror("foreman_cif.yuv");
        return EXIT_FAILURE;
    }
    fd_out = fopen("c_10bpp.yuv", "wb");
    if (fd_out == NULL) {
        perror("c_10bpp.yuv");
        fclose(fd_in);
        return EXIT_FAILURE;
    }

    while ((c = fgetc(fd_in)) != EOF) {
        c <<= 2;                        /* 8 bpp -> 10 bpp */
        /*
         * BUG FIX: fwrite(&d[0], 1, 1, ...) wrote the first byte of an
         * int, which is the MSB on big-endian machines.  fputc() writes
         * the intended value portably.
         */
        fputc(c & 0xff, fd_out);        /* low byte */
        fputc((c >> 8) & 0xff, fd_out); /* high byte: top 2 bits */
    }

    fclose(fd_in);
    fclose(fd_out);
    return EXIT_SUCCESS;
}
答案 0(得分:4):
问题中的代码在我的计算机上需要 25 秒。
- numpy:0.37 秒
import numpy as np

# Map both files straight into memory; no explicit read/write loop needed.
a_in = np.memmap('foreman_cif.yuv', mode='readonly')
n = len(a_in)
a_out = np.memmap('py_10bpp.yuv', mode='write', shape=2 * n)

# Each 8-bit sample becomes a little-endian 16-bit value: low byte first,
# then the two bits shifted out on top.  uint8 arithmetic wraps modulo 256,
# so `<< 2` already behaves like `(b << 2) & 0xff`.
a_out[0::2] = a_in << 2
a_out[1::2] = a_in >> 6
- cython:0.20 秒
from functools import partial
import pyximport; pyximport.install()  # pip install cython
from bpp8to10 import convert  # bpp8to10.pyx

f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'


def main():
    """Convert f_in (8 bpp) to f_out (10 bpp) 8 KiB at a time."""
    with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
        chunk = fd_in.read(8192)
        while chunk:
            fd_out.write(convert(chunk))
            chunk = fd_in.read(8192)


main()
:
bpp8to10.pyx
(纯CPython版本的主要加速是将代码从模块级别移动到 main() 函数中。)
from cpython.bytes cimport PyBytes_FromStringAndSize
def convert(bytes chunk not None):
    # Expand each 8-bit sample of `chunk` into a little-endian 10-bit pair:
    # low byte = (c << 2) & 0xff, high byte = c >> 6 (the top two bits).
    cdef:
        # Allocate the (uninitialised) output bytes object in one shot,
        # exactly twice the input length.
        bytes data = PyBytes_FromStringAndSize(NULL, len(chunk)*2)
        char* buf = data # no copy
        Py_ssize_t j = 0
        unsigned char c
    # Fill the buffer in place through the raw char pointer; `c` is typed
    # unsigned char, so the loop runs at C speed.
    for c in chunk:
        buf[j] = (c << 2)      # low byte (assignment truncates to 8 bits)
        buf[j + 1] = (c >> 6)  # high byte: top 2 bits
        j += 2
    return data
- main() + multiprocessing:6.7 秒(2 个 CPU)
from functools import partial
from multiprocessing import Pool
f_in = 'foreman_cif.yuv'
f_out = 'py_10bpp.yuv'
def convert(chunk):
    """Expand each 8-bit sample in *chunk* to a little-endian 10-bit pair.

    For a sample ``b`` the output is ``(b << 2) & 0xff`` (low byte)
    followed by ``b >> 6`` (the top two bits), matching the C reference
    implementation and the numpy version.
    """
    data = bytearray()  # [] -> bytearray(): 17 -> 15 seconds
    data_append = data.append  # 15 -> 12 seconds
    for b in bytearray(chunk):  # on Python 3: `for b in chunk:`
        data_append((b << 2) & 0xff)
        # BUG FIX: was (b >> 8) & 0xff, which is always 0 for an 8-bit
        # sample; the high byte must carry the two bits shifted out by << 2,
        # i.e. b >> 6.
        data_append((b >> 6) & 0xff)
    return data
def main():  # put in main(): 25 -> 17 seconds
    """Convert f_in to f_out, farming 8 KiB chunks out to worker processes."""
    pool = Pool(processes=2)  # 12 -> 6.7 seconds
    try:
        with open(f_in, 'rb') as fd_in, open(f_out, 'wb') as fd_out:
            # imap() preserves chunk order, so output bytes stay in sequence.
            for data in pool.imap(convert, iter(partial(fd_in.read, 8192), b'')):
                fd_out.write(data)
    finally:
        # BUG FIX: the pool was never shut down; close() + join() releases
        # the worker processes deterministically instead of leaking them.
        pool.close()
        pool.join()

main()
- pypy:1.6 秒