Question

我正在尝试优化Reed-Solomon编码器，这实际上只是对Galois Fields 2 ^ 8的多项式除法运算（这意味着值环绕超过255）。事实上，代码与Go中的代码非常相似：http://research.swtch.com/field

这里使用的多项式除法算法是synthetic division（也称为Horner方法）。

我尝试了一切：numpy，pypy，cython。我得到的最佳性能是使用pypy和这个简单的嵌套循环：

def rsenc(msg_in, nsym, gen):
    '''Reed-Solomon encoding using polynomial division, better explained at http://research.swtch.com/field'''
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    lgen = bytearray([gf_log[gen[j]] for j in xrange(len(gen))])

    for i in xrange(len(msg_in)):
        coef = msg_out[i]
        # coef = gf_mul(msg_out[i], gf_inverse(gen[0]))  // for general polynomial division (when polynomials are non-monic), we need to compute: coef = msg_out[i] / gen[0]
        if coef != 0: # coef 0 is normally undefined so we manage it manually here (and it also serves as an optimization btw)
            lcoef = gf_log[coef] # precaching

            for j in xrange(1, len(gen)): # optimization: can skip g0 because the first coefficient of the generator is always 1! (that's why we start at position 1)
                msg_out[i + j] ^= gf_exp[lcoef + lgen[j]] # equivalent (in Galois Field 2^8) to msg_out[i+j] += msg_out[i] * gen[j]

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in
    return msg_out

Python优化向导能否引导我了解如何获得加速的一些线索？我的目标是至少获得3倍的加速，但更多的将是非常棒的。任何方法或工具都可以接受，只要它是跨平台的（至少在Linux和Windows上工作）。

这是一个小测试脚本，其中包含我尝试的其他一些替代方法（因为它比原生python慢，所以不包括cython尝试！）：

import random
from operator import xor

numpy_enabled = False
try:
    import numpy as np
    numpy_enabled = True
except ImportError:
    pass

# Exponent table for 3, a generator for GF(256)
gf_exp = bytearray([1, 3, 5, 15, 17, 51, 85, 255, 26, 46, 114, 150, 161, 248, 19,
    53, 95, 225, 56, 72, 216, 115, 149, 164, 247, 2, 6, 10, 30, 34,
    102, 170, 229, 52, 92, 228, 55, 89, 235, 38, 106, 190, 217, 112,
    144, 171, 230, 49, 83, 245, 4, 12, 20, 60, 68, 204, 79, 209, 104,
    184, 211, 110, 178, 205, 76, 212, 103, 169, 224, 59, 77, 215, 98,
    166, 241, 8, 24, 40, 120, 136, 131, 158, 185, 208, 107, 189, 220,
    127, 129, 152, 179, 206, 73, 219, 118, 154, 181, 196, 87, 249, 16,
    48, 80, 240, 11, 29, 39, 105, 187, 214, 97, 163, 254, 25, 43, 125,
    135, 146, 173, 236, 47, 113, 147, 174, 233, 32, 96, 160, 251, 22,
    58, 78, 210, 109, 183, 194, 93, 231, 50, 86, 250, 21, 63, 65, 195,
    94, 226, 61, 71, 201, 64, 192, 91, 237, 44, 116, 156, 191, 218,
    117, 159, 186, 213, 100, 172, 239, 42, 126, 130, 157, 188, 223,
    122, 142, 137, 128, 155, 182, 193, 88, 232, 35, 101, 175, 234, 37,
    111, 177, 200, 67, 197, 84, 252, 31, 33, 99, 165, 244, 7, 9, 27,
    45, 119, 153, 176, 203, 70, 202, 69, 207, 74, 222, 121, 139, 134,
    145, 168, 227, 62, 66, 198, 81, 243, 14, 18, 54, 90, 238, 41, 123,
    141, 140, 143, 138, 133, 148, 167, 242, 13, 23, 57, 75, 221, 124,
    132, 151, 162, 253, 28, 36, 108, 180, 199, 82, 246] * 2 + [1])

# Logarithm table, base 3
gf_log = bytearray([0, 0, 25, 1, 50, 2, 26, 198, 75, 199, 27, 104, 51, 238, 223, # BEWARE: the first entry should be None instead of 0 because it's undefined, but for a bytearray we can't set such a value
        3, 100, 4, 224, 14, 52, 141, 129, 239, 76, 113, 8, 200, 248, 105,
        28, 193, 125, 194, 29, 181, 249, 185, 39, 106, 77, 228, 166, 114,
        154, 201, 9, 120, 101, 47, 138, 5, 33, 15, 225, 36, 18, 240, 130,
        69, 53, 147, 218, 142, 150, 143, 219, 189, 54, 208, 206, 148, 19,
        92, 210, 241, 64, 70, 131, 56, 102, 221, 253, 48, 191, 6, 139, 98,
        179, 37, 226, 152, 34, 136, 145, 16, 126, 110, 72, 195, 163, 182,
        30, 66, 58, 107, 40, 84, 250, 133, 61, 186, 43, 121, 10, 21, 155,
        159, 94, 202, 78, 212, 172, 229, 243, 115, 167, 87, 175, 88, 168,
        80, 244, 234, 214, 116, 79, 174, 233, 213, 231, 230, 173, 232, 44,
        215, 117, 122, 235, 22, 11, 245, 89, 203, 95, 176, 156, 169, 81,
        160, 127, 12, 246, 111, 23, 196, 73, 236, 216, 67, 31, 45, 164,
        118, 123, 183, 204, 187, 62, 90, 251, 96, 177, 134, 59, 82, 161,
        108, 170, 85, 41, 157, 151, 178, 135, 144, 97, 190, 220, 252, 188,
        149, 207, 205, 55, 63, 91, 209, 83, 57, 132, 60, 65, 162, 109, 71,
        20, 42, 158, 93, 86, 242, 211, 171, 68, 17, 146, 217, 35, 32, 46,
        137, 180, 124, 184, 38, 119, 153, 227, 165, 103, 74, 237, 222, 197,
        49, 254, 24, 13, 99, 140, 128, 192, 247, 112, 7])

if numpy_enabled:
    np_gf_exp = np.array(gf_exp)
    np_gf_log = np.array(gf_log)

def gf_pow(x, power):
    return gf_exp[(gf_log[x] * power) % 255]

def gf_poly_mul(p, q):
    r = [0] * (len(p) + len(q) - 1)
    lp = [gf_log[p[i]] for i in xrange(len(p))]
    for j in range(len(q)):
        lq = gf_log[q[j]]
        for i in range(len(p)):
            r[i + j] ^= gf_exp[lp[i] + lq]
    return r

def rs_generator_poly_base3(nsize, fcr=0):
    g_all = {}
    g = [1]
    g_all[0] = g_all[1] = g
    for i in range(fcr+1, fcr+nsize+1):
        g = gf_poly_mul(g, [1, gf_pow(3, i)])
        g_all[nsize-i] = g
    return g_all

# Fastest way with pypy
def rsenc(msg_in, nsym, gen):
    '''Reed-Solomon encoding using polynomial division, better explained at http://research.swtch.com/field'''
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    lgen = bytearray([gf_log[gen[j]] for j in xrange(len(gen))])

    for i in xrange(len(msg_in)):
        coef = msg_out[i]
        # coef = gf_mul(msg_out[i], gf_inverse(gen[0]))  # for general polynomial division (when polynomials are non-monic), the usual way of using synthetic division is to divide the divisor g(x) with its leading coefficient (call it a). In this implementation, this means:we need to compute: coef = msg_out[i] / gen[0]
        if coef != 0: # coef 0 is normally undefined so we manage it manually here (and it also serves as an optimization btw)
            lcoef = gf_log[coef] # precaching

            for j in xrange(1, len(gen)): # optimization: can skip g0 because the first coefficient of the generator is always 1! (that's why we start at position 1)
                msg_out[i + j] ^= gf_exp[lcoef + lgen[j]] # equivalent (in Galois Field 2^8) to msg_out[i+j] += msg_out[i] * gen[j]

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in
    return msg_out

# Alternative 1: the loops were completely changed, instead of fixing msg_out[i] and updating all subsequent i+j items, we now fixate msg_out[i+j]  and compute it at once using all couples msg_out[i] * gen[j] - msg_out[i+1] * gen[j-1] - ... since when we fixate msg_out[i+j], all previous msg_out[k] with k < i+j are already fully computed.
def rsenc_alt1(msg_in, nsym, gen):
    msg_in = bytearray(msg_in)
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    lgen = bytearray([gf_log[gen[j]] for j in xrange(len(gen))])

    # Alternative 1
    jlist = range(1, len(gen))
    for k in xrange(1, len(msg_out)):
        for x in xrange(max(k-len(msg_in),0), len(gen)-1):
            if k-x-1 < 0: break
            msg_out[k] ^=  gf_exp[msg_out[k-x-1] + lgen[jlist[x]]]

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in
    return msg_out

# Alternative 2: a rewrite of alternative 1 with generators and reduce
def rsenc_alt2(msg_in, nsym, gen):
    msg_in = bytearray(msg_in)
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)
    lgen = bytearray([gf_log[gen[j]] for j in xrange(len(gen))])

    # Alternative 1
    jlist = range(1, len(gen))
    for k in xrange(1, len(msg_out)):
        items_gen = ( gf_exp[msg_out[k-x-1] + lgen[jlist[x]]] if k-x-1 >= 0 else next(iter(())) for x in xrange(max(k-len(msg_in),0), len(gen)-1) )
        msg_out[k] ^= reduce(xor, items_gen)

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in
    return msg_out

# Alternative with Numpy
def rsenc_numpy(msg_in, nsym, gen):
    msg_in = np.array(bytearray(msg_in))
    msg_out = np.pad(msg_in, (0, nsym), 'constant')
    lgen = np_gf_log[gen]
    for i in xrange(msg_in.size):
        msg_out[i+1:i+lgen.size] ^= np_gf_exp[np.add(lgen[1:], msg_out[i])]

    msg_out[:len(msg_in)] = msg_in
    return msg_out

gf_mul_arr = [bytearray(256) for _ in xrange(256)]
gf_add_arr = [bytearray(256) for _ in xrange(256)]

# Precompute multiplication and addition tables
def gf_precomp_tables(gf_exp=gf_exp, gf_log=gf_log):
    global gf_mul_arr, gf_add_arr
    for i in xrange(256):
        for j in xrange(256):
            gf_mul_arr[i][j] = gf_exp[gf_log[i] + gf_log[j]]
            gf_add_arr[i][j] = i ^ j
    return gf_mul_arr, gf_add_arr

# Alternative with precomputation of multiplication and addition tables, inspired by zfec: https://hackage.haskell.org/package/fec-0.1.1/src/zfec/fec.c
def rsenc_precomp(msg_in, nsym, gen=None):
    msg_in = bytearray(msg_in)
    msg_out = bytearray(msg_in) + bytearray(len(gen)-1)

    for i in xrange(len(msg_in)): # [i for i in xrange(len(msg_in)) if msg_in[i] != 0]
        coef = msg_out[i]
        if coef != 0:  # coef 0 is normally undefined so we manage it manually here (and it also serves as an optimization btw)
            mula = gf_mul_arr[coef]
            for j in xrange(1, len(gen)): # optimization: can skip g0 because the first coefficient of the generator is always 1! (that's why we start at position 1)
                #msg_out[i + j] = gf_add_arr[msg_out[i+j]][gf_mul_arr[coef][gen[j]]] # slower...
                #msg_out[i + j] ^= gf_mul_arr[coef][gen[j]] # faster
                msg_out[i + j] ^= mula[gen[j]] # fastest

    # Recopy the original message bytes
    msg_out[:len(msg_in)] = msg_in # equivalent to c = mprime - b, where mprime is msg_in padded with [0]*nsym
    return msg_out

def randstr(n, size):
    '''Generate very fastly a random hexadecimal string. Kudos to jcdryer http://stackoverflow.com/users/131084/jcdyer'''
    hexstr = '%0'+str(size)+'x'
    for _ in xrange(n):
        yield hexstr % random.randrange(16**size)

# Simple test case
if __name__ == "__main__":

    # Setup functions to test
    funcs = [rsenc, rsenc_precomp, rsenc_alt1, rsenc_alt2]
    if numpy_enabled: funcs.append(rsenc_numpy)
    gf_precomp_tables()

    # Setup RS vars
    n = 255
    k = 213

    import time

    # Init the generator polynomial
    g = rs_generator_poly_base3(n)

    # Init the ground truth
    mes = 'hello world'
    mesecc_correct = rsenc(mes, n-11, g[k])

    # Test the functions
    for func in funcs:
        # Sanity check
        if func(mes, n-11, g[k]) != mesecc_correct: print func.__name__, ": output is incorrect!"
        # Time the function
        total_time = 0
        for m in randstr(1000, n):
            start = time.clock()
            func(m, n-k, g[k])
            total_time += time.clock() - start
        print func.__name__, ": total time elapsed %f seconds." % total_time

这是我机器上的结果：

With PyPy:
rsenc : total time elapsed 0.108183 seconds.
rsenc_alt1 : output is incorrect!
rsenc_alt1 : total time elapsed 0.164084 seconds.
rsenc_alt2 : output is incorrect!
rsenc_alt2 : total time elapsed 0.557697 seconds.

Without PyPy:
rsenc : total time elapsed 3.518857 seconds.
rsenc_alt1 : output is incorrect!
rsenc_alt1 : total time elapsed 5.630897 seconds.
rsenc_alt2 : output is incorrect!
rsenc_alt2 : total time elapsed 6.100434 seconds.
rsenc_numpy : output is incorrect!
rsenc_numpy : total time elapsed 1.631373 seconds

（注意：替代方案应该是正确的，某些索引必须有点关闭，但因为它们反正慢，所以我没有尝试修复它们）

/ UPDATE和赏金的目标：我发现了一个非常有趣的优化技巧，它有望加速计算：precompute the multiplication table。我使用新函数rsenc_precomp（）更新了上面的代码。但是，在我的实现中根本没有任何收益，它甚至有点慢：

rsenc : total time elapsed 0.107170 seconds.
rsenc_precomp : total time elapsed 0.108788 seconds.

如果数组查找的成本高于添加或xor等操作，那怎么可能？为什么它在ZFEC中有效而不在Python中？

我会将赏金归功于能告诉我如何使这种乘法/加法查找表优化工作的人（比xor和加法运算更快），或者谁可以通过引用或分析向我解释为什么这个优化在这里不起作用（使用Python / PyPy / Cython / Numpy等。我全部尝试过。）

Answer 1

以下比我机器上的pypy快3倍（0.04s vs 0.15s）。使用Cython：

<body ng-app="myApp">

<table ng-controller="myCtrl">
    <tbody>
        <tr>
            <td style="text-align:left; padding-left:12px;">
                Select A Value
            </td>
        </tr>
        <tr>
            <td>
                <select id="soflow" ng-model="selectedItem" ng-change="selectedItemChanged(rec.Rezept_Nr)">
                    <option ng-repeat="rec in recipies" value="{{rec.Rezept_Nr}}">{{rec.Rezept_Bez}}</option>
                </select>
            </td>
        <tr>
        <tr>
            <td>Gesamtweg des Gießkolbens</td>
            <td >
                <input type="text" name="textfield" id="textfield" value="{{castVals.gesamtweg}}"> 
            </td>
        </tr>
    </tbody>

这是您使用静态类型的最快版本（并检查来自ctypedef unsigned char uint8_t # does not work with Microsoft's C Compiler: from libc.stdint cimport uint8_t cimport cpython.array as array cdef uint8_t[::1] gf_exp = bytearray([1, 3, 5, 15, 17, 51, 85, 255, 26, 46, 114, 150, 161, 248, 19, lots of numbers omitted for space reasons ...]) cdef uint8_t[::1] gf_log = bytearray([0, 0, 25, 1, 50, 2, 26, 198, 75, 199, 27, 104, more numbers omitted for space reasons ...]) import cython @cython.boundscheck(False) @cython.wraparound(False) @cython.initializedcheck(False) def rsenc(msg_in_r, nsym, gen_t): '''Reed-Solomon encoding using polynomial division, better explained at http://research.swtch.com/field''' cdef uint8_t[::1] msg_in = bytearray(msg_in_r) # have to copy, unfortunately - can't make a memory view from a read only object cdef int[::1] gen = array.array('i',gen_t) # convert list to array cdef uint8_t[::1] msg_out = bytearray(msg_in) + bytearray(len(gen)-1) cdef int j cdef uint8_t[::1] lgen = bytearray(gen.shape[0]) for j in xrange(gen.shape[0]): lgen[j] = gf_log[gen[j]] cdef uint8_t coef,lcoef cdef int i for i in xrange(msg_in.shape[0]): coef = msg_out[i] if coef != 0: # coef 0 is normally undefined so we manage it manually here (and it also serves as an optimization btw) lcoef = gf_log[coef] # precaching for j in xrange(1, gen.shape[0]): # optimization: can skip g0 because the first coefficient of the generator is always 1! (that's why we start at position 1) msg_out[i + j] ^= gf_exp[lcoef + lgen[j]] # equivalent (in Galois Field 2^8) to msg_out[i+j] -= msg_out[i] * gen[j] # Recopy the original message bytes msg_out[:msg_in.shape[0]] = msg_in return msg_out的html，直到循环没有以黄色突出显示）。

一些简短的说明：

Cython喜欢cython -a到x.shape[0]
将记忆视图定义为len(shape)承诺它们在内存中是连续的，这有助于
[::1]是必要的，可以避免对全局定义的initializedcheck(False)和gf_exp进行大量的存在检查。（您可能会发现通过为这些代码创建局部变量引用并使用它来加速您的基本Python / PyPy代码）
我不得不复制几个输入参数。 Cython不能从只读对象（在这种情况下是gf_log，一个字符串中创建一个内存视图。我可能只是把它变成了char *）。此外，msg_in（列表）需要具有快速元素访问权限。

除此之外，它都非常简单。（我没有尝试过任何变化，但速度更快）。我对PyPy的表现印象非常深刻。

Answer 2

或者，如果你知道C，我建议在普通的C中重写这个Python函数并调用它（比如用CFFI）。至少你知道你在函数的内部循环中达到顶级性能，而不需要知道PyPy或Cython技巧。

请参阅：http://cffi.readthedocs.org/en/latest/overview.html#performance

优化reed-solomon编码器（多项式除法）

2 个答案: