Question

我有一个CRC计算函数，其内部循环中包含以下内容：

// Per-byte step of the CRC inner loop, examining uMsgByte from bit 0x80 down
// to bit 0x01: when a bit is set, the value pChkTableOffset currently points
// at is XORed into crc.
// Only the XOR is guarded by each `if`; the `pChkTableOffset++` that follows
// on the same line is a separate statement and always executes, so the
// pointer advances by eight entries regardless of the byte's value.
if (uMsgByte & 0x80) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x40) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x20) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x10) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x08) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x04) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x02) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x01) crc ^= *pChkTableOffset; pChkTableOffset++;

性能分析显示，这些语句耗费了大量时间。我想知道能否用位运算技巧（bit hack）取代条件判断来提速。我尝试了以下写法，但速度并没有提高：

// Variant of the per-byte step that replaces each `if` with a mask.
// !(uMsgByte & m) is 1 when the bit is clear and 0 when it is set, so
// subtracting 1 yields 0 (bit clear) or -1 (bit set). ANDing the table value
// with that mask either discards it or keeps it before the XOR into crc.
// Every table entry is read and the pointer advances on every line.
// NOTE(review): the result relies on -1 acting as an all-ones mask for the
// type of *pChkTableOffset, which is not visible here - confirm.
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x80) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x40) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x20) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x10) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x08) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x04) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x02) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x01) - 1);

在较新的 x86 CPU 上，这种写法应该更快吗？或者有没有更好的方法来实现这类“bit hack”？

Answer 1

我无法确定哪个更快，但它们肯定是不同的——哪个更快取决于具体使用的处理器品牌和型号，因为不同处理器在处理[可能不可预测的]分支时表现不同。更复杂的是，不同处理器对“相互依赖的计算”的处理方式也各不相同。

我把问题中的代码改写成了下面的形式（只处理高四位，因此生成的代码大约短一半，但概念上完全相同）：

// Branching variant, upper four bits only.
// For k = 0..3, table entry k is XORed into the result when bit (0x80 >> k)
// of uMsgByte is set; entries for clear bits are never read.
//   uMsgByte        - message byte; only bits 0x80, 0x40, 0x20, 0x10 are tested
//   pChkTableOffset - start of the four table entries
// Returns the XOR of the selected entries, starting from 0.
int func1(int uMsgByte, char* pChkTableOffset)
{
    int crc = 0;

    if (uMsgByte & 0x80) {
        crc ^= pChkTableOffset[0];
    }
    if (uMsgByte & 0x40) {
        crc ^= pChkTableOffset[1];
    }
    if (uMsgByte & 0x20) {
        crc ^= pChkTableOffset[2];
    }
    if (uMsgByte & 0x10) {
        crc ^= pChkTableOffset[3];
    }

    return crc;
}


// Mask-based variant, upper four bits only.
// All four table entries are always read; each one is ANDed with a mask that
// is -1 (all bits set) when the matching bit of uMsgByte is set and 0
// otherwise, and the masked values are XORed together.
//   uMsgByte        - message byte; only bits 0x80, 0x40, 0x20, 0x10 are tested
//   pChkTableOffset - start of the four table entries
// Returns the XOR of the selected entries, starting from 0.
int func2(int uMsgByte, char* pChkTableOffset)
{
    const int keep0 = -static_cast<int>((uMsgByte & 0x80) != 0);
    const int keep1 = -static_cast<int>((uMsgByte & 0x40) != 0);
    const int keep2 = -static_cast<int>((uMsgByte & 0x20) != 0);
    const int keep3 = -static_cast<int>((uMsgByte & 0x10) != 0);

    return (pChkTableOffset[0] & keep0)
         ^ (pChkTableOffset[1] & keep1)
         ^ (pChkTableOffset[2] & keep2)
         ^ (pChkTableOffset[3] & keep3);
}

使用clang++ -S -O2编译：

func1：

_Z5func1jPh:                            # @_Z5func1jPh
        xorl    %eax, %eax
        testb   %dil, %dil
        jns     .LBB0_2
        movzbl  (%rsi), %eax
.LBB0_2:                                # %if.end
        testb   $64, %dil
        je      .LBB0_4
        movzbl  1(%rsi), %ecx
        xorl    %ecx, %eax
.LBB0_4:                                # %if.end.6
        testb   $32, %dil
        je      .LBB0_6
        movzbl  2(%rsi), %ecx
        xorl    %ecx, %eax
.LBB0_6:                                # %if.end.13
        testb   $16, %dil
        je      .LBB0_8
        movzbl  3(%rsi), %ecx
        xorl    %ecx, %eax
.LBB0_8:                                # %if.end.20
        retq

func2：

_Z5func2jPh:                            # @_Z5func2jPh
        movzbl  (%rsi), %eax
        movl    %edi, %ecx
        shll    $24, %ecx
        sarl    $31, %ecx
        andl    %eax, %ecx
        movzbl  1(%rsi), %eax
        movl    %edi, %edx
        shll    $25, %edx
        sarl    $31, %edx
        andl    %edx, %eax
        xorl    %ecx, %eax
        movzbl  2(%rsi), %ecx
        movl    %edi, %edx
        shll    $26, %edx
        sarl    $31, %edx
        andl    %ecx, %edx
        movzbl  3(%rsi), %ecx
        shll    $27, %edi
        sarl    $31, %edi
        andl    %ecx, %edi
        xorl    %edx, %edi
        xorl    %edi, %eax
        retq

正如您所看到的，编译器为第一个版本生成了分支，而第二个版本使用的是逻辑运算——每一位所需的指令更多。

我可以编写一些代码来对每个循环进行基准测试，但我保证不同版本的x86处理器之间的结果会有很大差异。

我不确定这是否是一个常见的CRC计算，但是大多数CRC计算都有优化版本，这些版本使用表格和其他“聪明的东西”来快速执行正确的计算。

Answer 2

我很想知道人能否胜过优化编译器，于是用两种方式编写了这个算法：

第一种写法表达意图的方式就像在手写机器代码：

// Hand-unrolled, branching form of the per-byte CRC step.
// For k = 0..7, table entry k is XORed into crc when bit (0x80 >> k) of
// uMsgByte is set; entries for clear bits are not read.
//   uMsgByte        - message byte, examined from bit 0x80 down to bit 0x01
//   crc             - running value to update
//   pChkTableOffset - start of the eight table entries for this byte
// Returns the updated crc.
std::uint32_t foo1(std::uint8_t uMsgByte,
                   std::uint32_t crc,
                   const std::uint32_t* pChkTableOffset)
{
    if (uMsgByte & 0x80) {
        crc ^= pChkTableOffset[0];
    }
    if (uMsgByte & 0x40) {
        crc ^= pChkTableOffset[1];
    }
    if (uMsgByte & 0x20) {
        crc ^= pChkTableOffset[2];
    }
    if (uMsgByte & 0x10) {
        crc ^= pChkTableOffset[3];
    }
    if (uMsgByte & 0x08) {
        crc ^= pChkTableOffset[4];
    }
    if (uMsgByte & 0x04) {
        crc ^= pChkTableOffset[5];
    }
    if (uMsgByte & 0x02) {
        crc ^= pChkTableOffset[6];
    }
    if (uMsgByte & 0x01) {
        crc ^= pChkTableOffset[7];
    }

    return crc;
}

第二种写法则以更偏算法的方式表达意图……

// Loop form of the per-byte CRC step.
// For i = 0..7, table entry i is XORed into crc when bit (0x80 >> i) of
// uMsgByte is set, i.e. the byte is examined from its most significant bit
// down to its least significant bit.
//   uMsgByte        - message byte to fold in
//   crc             - running value to update
//   pChkTableOffset - start of the eight table entries for this byte
// Returns the updated crc.
std::uint32_t foo2(std::uint8_t uMsgByte,
                   std::uint32_t crc,
                   const std::uint32_t* pChkTableOffset)
{
    // All eight bits must be visited; a bound of 7 would silently skip the
    // 0x01 bit and table entry 7.
    for (int i = 0; i < 8; ++i) {
        if (uMsgByte & (0x80 >> i)) {
            crc ^= pChkTableOffset[i];
        }
    }
    return crc;
}

然后我用 g++ -O3 编译，结果是……

两个函数生成了完全相同的目标代码。

这个故事的寓意：选择正确的算法，避免重复，编写优雅的代码，让优化器去做它的工作。

这是证据：

__Z4foo1hjPKj:                          ## @_Z4foo1hjPKj
    .cfi_startproc
## BB#0:
    pushq   %rbp
Ltmp0:
    .cfi_def_cfa_offset 16
Ltmp1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp2:
    .cfi_def_cfa_register %rbp
    testb   $-128, %dil
    je  LBB0_2
## BB#1:
    xorl    (%rdx), %esi
LBB0_2:
    testb   $64, %dil
    je  LBB0_4
## BB#3:
    xorl    4(%rdx), %esi
LBB0_4:
    testb   $32, %dil
    je  LBB0_6
## BB#5:
    xorl    8(%rdx), %esi
LBB0_6:
    testb   $16, %dil
    je  LBB0_8
## BB#7:
    xorl    12(%rdx), %esi
LBB0_8:
    testb   $8, %dil
    je  LBB0_10
## BB#9:
    xorl    16(%rdx), %esi
LBB0_10:
    testb   $4, %dil
    je  LBB0_12
## BB#11:
    xorl    20(%rdx), %esi
LBB0_12:
    testb   $2, %dil
    je  LBB0_14
## BB#13:
    xorl    24(%rdx), %esi
LBB0_14:
    testb   $1, %dil
    je  LBB0_16
## BB#15:
    xorl    28(%rdx), %esi
LBB0_16:
    movl    %esi, %eax
    popq    %rbp
    retq
    .cfi_endproc

    .globl  __Z4foo2hjPKj
    .align  4, 0x90
__Z4foo2hjPKj:                          ## @_Z4foo2hjPKj
    .cfi_startproc
## BB#0:
    pushq   %rbp
Ltmp3:
    .cfi_def_cfa_offset 16
Ltmp4:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp5:
    .cfi_def_cfa_register %rbp
    testb   $-128, %dil
    je  LBB1_2
## BB#1:
    xorl    (%rdx), %esi
LBB1_2:
    testb   $64, %dil
    je  LBB1_4
## BB#3:
    xorl    4(%rdx), %esi
LBB1_4:
    testb   $32, %dil
    je  LBB1_6
## BB#5:
    xorl    8(%rdx), %esi
LBB1_6:
    testb   $16, %dil
    je  LBB1_8
## BB#7:
    xorl    12(%rdx), %esi
LBB1_8:
    testb   $8, %dil
    je  LBB1_10
## BB#9:
    xorl    16(%rdx), %esi
LBB1_10:
    testb   $4, %dil
    je  LBB1_12
## BB#11:
    xorl    20(%rdx), %esi
LBB1_12:
    testb   $2, %dil
    je  LBB1_14
## BB#13:
    xorl    24(%rdx), %esi
LBB1_14:
    movl    %esi, %eax
    popq    %rbp
    retq
    .cfi_endproc

再看看对于用逻辑运算代替条件语句的版本，编译器是否也能处理得同样好，会很有意思。

给定如下代码：

// Mask-based form of the per-byte CRC step (no `if` per bit).
// For each bit of uMsgByte, from 0x80 down to 0x01, the next table entry is
// read unconditionally, ANDed with an all-ones mask when the bit is set or
// with zero when it is clear, and XORed into crc.
//   uMsgByte        - message byte to fold in
//   crc             - running value to update
//   pChkTableOffset - start of the eight table entries for this byte
// Returns the updated crc.
std::uint32_t logical1(std::uint8_t uMsgByte,
                       std::uint32_t crc,
                       const std::uint32_t* pChkTableOffset)
{
    for (int bit = 7; bit >= 0; --bit) {
        // 0 - 1 wraps to all ones; 0 - 0 stays zero.
        const std::uint32_t keep =
            static_cast<std::uint32_t>(0) - static_cast<std::uint32_t>((uMsgByte >> bit) & 1);
        crc ^= pChkTableOffset[7 - bit] & keep;
    }

    return crc;
}

生成的机器代码为：

以下指令序列重复 8 次：

    movl    %edi, %eax     ; get uMsgByte into eax
    shll    $24, %eax      ; shift it left 24 bits so that bit 7 is in the sign bit
    sarl    $31, %eax      ; arithmetic shift right to copy the sign bit into all other bits
    andl    (%rdx), %eax   ; and the result with the value from the table
    xorl    %esi, %eax     ; exclusive-or into crc

所以简短的回答是肯定的——编译器处理得非常好（还省去了对 pChkTableOffset 的多余自增）。

更快吗？谁知道。差别可能小到无法测量——两种情况下的内存读取次数相同。编译器可以自行判断避免分支是否更划算（这取决于编译器所针对优化的体系结构）。

它更优雅、更易读吗？在我看来并不是。这是我在以下情况下才会写的那种代码：

C 还是一门年轻的语言；
处理器足够简单，我自己能优化得更好；
处理器太慢，我不得不这样做。

这些情况如今都已不复存在。

Answer 3

如果这个校验和确实是CRC，那么有一种更有效的方法来实现它。

假设它是CRC16：

头文件：

// Table-driven CRC-16 calculator: a 256-entry lookup table is built once from
// the polynomial, after which each message byte costs one table lookup.
class CRC16
{
public:
    // Fills CRCTab for the given generator polynomial.
    CRC16(const unsigned short poly);
    // Returns the CRC of the len bytes starting at pbuf; the running value
    // starts at 0.
    unsigned short CalcCRC(unsigned char * pbuf, int len);

protected:
    // One precomputed entry per possible byte value.
    unsigned short CRCTab[256];
    // Returns the low `bits` bits of `swap` in reversed order.
    unsigned long SwapBits(unsigned long swap, int bits);
};

实现：

// Builds the 256-entry lookup table for the generator polynomial `poly`.
// For every byte value: bit-reverse it into the high byte of a 16-bit
// register, run eight shift-left steps (XORing in `poly` whenever the top bit
// was set before the shift), then bit-reverse the 16-bit result.
CRC16::CRC16(const unsigned short poly)
{
    for (int value = 0; value < 256; ++value) {
        unsigned short entry = SwapBits(value, 8) << 8;

        for (int step = 0; step < 8; ++step) {
            const bool topBitSet = (entry & 0x8000) != 0;
            // The assignment truncates the shifted value back to 16 bits.
            entry = (entry << 1) ^ (topBitSet ? poly : 0);
        }

        CRCTab[value] = SwapBits(entry, 16);
    }
}

// Returns the lowest `bits` bits of `swap` in reversed order (bit 0 becomes
// bit bits-1, and so on). Bits of `swap` above that range are ignored.
// `bits` values of 0 or less return 0.
unsigned long CRC16::SwapBits(unsigned long swap, int bits)
{
    unsigned long r = 0;
    for(int i = 0; i < bits; i++) {
        // Shift an unsigned long, not an int: an int shift gives a wrong or
        // undefined result once the target position reaches bit 31.
        if(swap & 1) r |= 1UL << (bits - i - 1);
        swap >>= 1;
    }
    return r;
}

// Computes the CRC of `len` bytes starting at `pbuf` with the precomputed
// table. The running value starts at 0; each byte is XORed with its low byte
// to select a table entry, which is combined with the value shifted right by
// eight bits.
// A `len` of zero or less processes no bytes and returns 0 (a negative count
// would otherwise run far past the end of the buffer).
unsigned short CRC16::CalcCRC(unsigned char * pbuf, int len)
{
    unsigned short r = 0;
    for(int i = 0; i < len; i++) {
        r = (r >> 8) ^ CRCTab[(r & 0xFF) ^ pbuf[i]];
    }
    return r;
}

如您所见，消息的每个字节仅使用一次，而不是8次。

CRC8有类似的实现。

Answer 4

出于兴趣，我在 alain 关于预先计算 CRC 表的出色建议基础上做了扩展，发现这个类可以改造成利用 C++14 的 constexpr：

#include <iostream>
#include <utility>
#include <string>

// Table-driven CRC-16 calculator whose 256-entry lookup table can be built in
// a constant expression, so a `constexpr CRC16` object carries a precomputed
// table.
class CRC16
{
private:

    // the storage for the CRC table, to be computed at compile time
    unsigned short CRCTab[256];

    // private template-expanded constructor allows folded calls to SwapBits at compile time:
    // entry i starts out as the bit-reversed byte i placed in the high byte.
    // The explicit cast keeps the braced initializer free of an implicit
    // unsigned long -> unsigned short conversion.
    template<std::size_t...Is>
    constexpr CRC16(const unsigned short poly, std::integer_sequence<std::size_t, Is...>)
    : CRCTab { static_cast<unsigned short>(SwapBits(Is, 8) << 8)... }
    {}

    // swap bits at compile time: returns the lowest `bits` bits of `swap` in
    // reversed order; `bits` values of 0 or less return 0
    static constexpr unsigned long SwapBits(unsigned long swap, int bits)
    {
        unsigned long r = 0;
        for(int i = 0; i < bits; i++) {
            // Shift an unsigned long, not an int: an int shift gives a wrong
            // or undefined result once the target position reaches bit 31.
            if(swap & 1) r |= 1UL << (bits - i - 1);
            swap >>= 1;
        }
        return r;
    }


public:

    // public constexpr defers to private template expansion...
    constexpr CRC16(const unsigned short poly)
    : CRC16(poly, std::make_index_sequence<256>())
    {
        //... and then modifies the table - at compile time:
        // eight shift-left steps per entry (XORing in poly whenever the top
        // bit was set before the shift), then a 16-bit reversal.
        for(int i = 0; i < 256; i++) {
            for(int j = 0; j < 8; j++)
                CRCTab[i] = (CRCTab[i] << 1) ^ ((CRCTab[i] & 0x8000) ? poly : 0);
            CRCTab[i] = SwapBits(CRCTab[i], 16);
        }
    }

    // made const so that we can instantiate constexpr CRC16 objects
    // Returns the CRC of the len bytes at pbuf, starting from 0. A len of
    // zero or less processes no bytes and returns 0 (a negative count would
    // otherwise run far past the end of the buffer).
    unsigned short CalcCRC(const unsigned char * pbuf, int len) const
    {
        unsigned short r = 0;
        for(int i = 0; i < len; i++)
            r = (r >> 8) ^ CRCTab[(r & 0xFF) ^ pbuf[i]];
        return r;
    }

};



// Demo: builds a CRC16 object for polynomial 1234 as a compile-time constant
// and prints the CRC of the string "hello world".
int main()
{
    // create my constexpr CRC16 object at compile time
    constexpr CRC16 crctab(1234);

    // calculate the CRC of something...
    const std::string text("hello world");
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(text.data());
    const unsigned short crc = crctab.CalcCRC(bytes, text.size());

    std::cout << crc << std::endl;

    return 0;
}

于是 CRC16(1234) 的构造函数被很好地化简为如下常量数据：

__ZZ4mainE6crctab:
    .short  0                       ## 0x0
    .short  9478                    ## 0x2506
    .short  18956                   ## 0x4a0c
    .short  28426                   ## 0x6f0a
    .short  601                     ## 0x259
    .short  10079                   ## 0x275f
    .short  18517                   ## 0x4855
    .short  27987                   ## 0x6d53
... etc.

并且整个字符串的CRC的计算变为：

        leaq    __ZZ4mainE6crctab(%rip), %rdi ; <- referencing const data :)
        movzwl  (%rdi,%rdx,2), %edx
        jmp     LBB0_8
LBB0_4:
        xorl    %edx, %edx
        jmp     LBB0_11
LBB0_6:
        xorl    %edx, %edx
LBB0_8:                                 ## %.lr.ph.i.preheader.split
        testl   %esi, %esi
        je      LBB0_11
## BB#9:
        leaq    __ZZ4mainE6crctab(%rip), %rsi
        .align  4, 0x90
LBB0_10:                                ## %.lr.ph.i
                                        ## =>This Inner Loop Header: Depth=1
        movzwl  %dx, %edi
        movzbl  %dh, %edx  # NOREX
        movzbl  %dil, %edi
        movzbl  (%rcx), %ebx
        xorq    %rdi, %rbx
        xorw    (%rsi,%rbx,2), %dx
        movzwl  %dx, %edi
        movzbl  %dh, %edx  # NOREX
        movzbl  %dil, %edi
        movzbl  1(%rcx), %ebx
        xorq    %rdi, %rbx
        xorw    (%rsi,%rbx,2), %dx
        addq    $2, %rcx
        addl    $-2, %eax
        jne     LBB0_10
LBB0_11:

bit hack vs条件语句内部循环

4 个答案: