Question

我有一个必须转换为整数的字节数组（unsigned char *）。整数表示超过三个字节。这就是我所做的

//bytes array is allocated and filled
//allocating space for intBuffer (uint32_t)
unsigned long i = 0;
uint32_t number;
for(; i<size_tot; i+=3){
    uint32_t number = (bytes[i]<<16) | (bytes[i+1]<<8) | bytes[i+2];
    intBuffer[number]++;
}

这段代码可以很好地完成工作，但由于内存中的三次访问（特别是size_tot的大值，按3000000的顺序），它的速度非常慢。有没有办法更快地完成它并提高性能？

Answer 1

正确的答案几乎总是：

编写正确的代码，启用优化，信任编译器。

下式给出：

void count_values(std::array<uint32_t, 256^3>& results,
                  const unsigned char* from,
                  const unsigned char* to)
{
    for(; from != to; from  = std::next(from, 3)) {
        ++results[(*from << 16) | (*std::next(from, 1) << 8) | *(std::next(from,2))];
    }
}

使用-O3

编译

收益率（内联说明性评论）：

__Z12count_valuesRNSt3__15arrayIjLm259EEEPKhS4_: ## @_Z12count_valuesRNSt3__15arrayIjLm259EEEPKhS4_
    .cfi_startproc
## BB#0:
    pushq   %rbp
Ltmp0:
    .cfi_def_cfa_offset 16
Ltmp1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp2:
    .cfi_def_cfa_register %rbp
    jmp LBB0_2
    .align  4, 0x90
LBB0_1:                                 ## %.lr.ph
                                        ##   in Loop: Header=BB0_2 Depth=1
# dereference from and extend the 8-bit value to 32 bits
    movzbl  (%rsi), %eax
    shlq    $16, %rax            # shift left 16
    movzbl  1(%rsi), %ecx        # dereference *(from+1) and extend to 32bits by padding with zeros
    shlq    $8, %rcx             # shift left 8
    orq %rax, %rcx               # or into above result 
    movzbl  2(%rsi), %eax        # dreference *(from+2) and extend to 32bits
    orq %rcx, %rax               # or into above result
    incl    (%rdi,%rax,4)        # increment the correct counter
    addq    $3, %rsi             # from += 3
LBB0_2:                                 ## %.lr.ph
                                        ## =>This Inner Loop Header: Depth=1
    cmpq    %rdx, %rsi           # while from != to
    jne LBB0_1
## BB#3:                                ## %._crit_edge
    popq    %rbp
    retq
    .cfi_endproc

请注意，不需要偏离标准构造或标准调用。编译器生成完美的代码。

为了进一步证明这一点，让我们发疯并写一个自定义迭代器，它允许我们将函数减少到这个：

void count_values(std::array<uint32_t, 256^3>& results,
                  byte_triple_iterator from,
                  byte_triple_iterator to)
{
    assert(iterators_correct(from, to));
    while(from != to) {
        ++results[*from++];
    }
}

以下是这种迭代器的（基本）实现：

struct byte_triple_iterator
{
    constexpr byte_triple_iterator(const std::uint8_t* p)
    : _ptr(p)
    {}

    std::uint32_t operator*() const noexcept {
        return (*_ptr << 16) | (*std::next(_ptr, 1) << 8) | *(std::next(_ptr,2));
    }

    byte_triple_iterator& operator++() noexcept {
        _ptr = std::next(_ptr, 3);
        return *this;
    }

    byte_triple_iterator operator++(int) noexcept {
        auto copy = *this;
        _ptr = std::next(_ptr, 3);
        return copy;
    }

    constexpr const std::uint8_t* byte_ptr() const {
        return _ptr;
    }

private:

    friend bool operator<(const byte_triple_iterator& from, const byte_triple_iterator& to)
    {
        return from._ptr < to._ptr;
    }

    friend bool operator==(const byte_triple_iterator& from, const byte_triple_iterator& to)
    {
        return from._ptr == to._ptr;
    }

    friend bool operator!=(const byte_triple_iterator& from, const byte_triple_iterator& to)
    {
        return not(from == to);
    }

    friend std::ptrdiff_t byte_difference(const byte_triple_iterator& from, const byte_triple_iterator& to)
    {
        return to._ptr - from._ptr;
    }

    const std::uint8_t* _ptr;
};

bool iterators_correct(const byte_triple_iterator& from,
                       const byte_triple_iterator& to)
{
    if (not(from < to))
        return false;
    auto dist = to.byte_ptr() - from.byte_ptr();
    return dist % 3 == 0;
}

现在我们有什么？

断言检查我们的源确实是正确的长度（在调试版本中）
保证大小合适的输出结构

但它对我们的目标代码做了什么？（使用-O3 -DNDEBUG编译）

    .globl  __Z12count_valuesRNSt3__15arrayIjLm259EEE20byte_triple_iteratorS3_
    .align  4, 0x90
__Z12count_valuesRNSt3__15arrayIjLm259EEE20byte_triple_iteratorS3_: ## @_Z12count_valuesRNSt3__15arrayIjLm259EEE20byte_triple_iteratorS3_
    .cfi_startproc
## BB#0:
    pushq   %rbp
Ltmp3:
    .cfi_def_cfa_offset 16
Ltmp4:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp5:
    .cfi_def_cfa_register %rbp
    jmp LBB1_2
    .align  4, 0x90
LBB1_1:                                 ## %.lr.ph
                                        ##   in Loop: Header=BB1_2 Depth=1
    movzbl  (%rsi), %eax
    shlq    $16, %rax
    movzbl  1(%rsi), %ecx
    shlq    $8, %rcx
    orq %rax, %rcx
    movzbl  2(%rsi), %eax
    orq %rcx, %rax
    incl    (%rdi,%rax,4)
    addq    $3, %rsi
LBB1_2:                                 ## %.lr.ph
                                        ## =>This Inner Loop Header: Depth=1
    cmpq    %rdx, %rsi
    jne LBB1_1
## BB#3:                                ## %._crit_edge
    popq    %rbp
    retq
    .cfi_endproc

答案：没有 - 它同样有效。

上课？没有真的！相信你的编译器!!!

Answer 2

假设你想要计算所有不同的值（你的代码：intBuffer[number]++;）（intBuffer有2 ^ 24项），你可以尝试做一些loop unrolling：

而不是：

for(; i<size_tot; i+=3){
    uint32_t number = (bytes[i]<<16) | (bytes[i+1]<<8) | bytes[i+2];
    intBuffer[number]++;
}

做的：

for(; i<size_tot; i+=12){   // add extra ckeck here..

    intBuffer[(bytes[i]<<16)   | (bytes[i+1]<<8) | bytes[i+2]]++;
    intBuffer[(bytes[i+3]<<16) | (bytes[i+4]<<8) | bytes[i+5]]++;
    intBuffer[(bytes[i+6]<<16) | (bytes[i+7]<<8) | bytes[i+8]]++;
    intBuffer[(bytes[i+9]<<16) | (bytes[i+10]<<8) | bytes[i+11]]++;
}
// Add a small loop for the remaining bytes (no multiple of 12)

这将允许cpu 在一个时钟周期内执行多条指令（确保在最高级别设置编译器优化）。

您还需要额外检查bytes的最后一部分。

查看Instruction Pipelining。

指令流水线是一种在单个处理器中实现称为指令级并行性的并行形式的技术。因此，它允许比在给定时钟速率下可能的更快的CPU吞吐量（可以在单位时间内执行的指令数量）。基本指令周期被分解为一个称为管道的系列。不是按顺序处理每个指令（在开始下一个指令之前完成一条指令），每条指令被分成一系列步骤，因此可以并行执行不同的步骤，并且可以同时处理指令（启动一条指令）在完成前一个之前的指示。）

<强>更新：

但速度非常慢

实际上，对于3MB，这应该是有些即时的，即使使用原始代码（考虑到数据已经被缓存）。如何定义bytes？可能是operator[]正在做一些额外的边界检查吗？

Answer 3

首先确保编译器优化转向最高级别。

我想我会尝试一下：

unsigned char* pBytes = bytes;
uint32_t number;

for(unsigned long i = 0; i<size_tot; i+=3){
    number = *pBytes << 16;
    ++pBytes;
    number = number | (*pBytes << 8);
    ++pBytes;
    number = number | *pBytes;
    ++pBytes;

    ++intBuffer[number];
}

编译之后，我会检查生成的汇编程序代码是如何查看更改实际上是否有所不同。

Answer 4

尝试一次读取一个单词，然后提取所需的值。这应该比逐字节读取更有效

以下是64位little-endian系统的示例实现，一次读取3个64位值

void count(uint8_t* bytes, int* intBuffer, uint32_t size_tot)
{
    assert(size_tot > 7);
    uint64_t num1, num2, num3;
    uint8_t *bp = bytes;
    while ((uintptr_t)bp % 8) // make sure that the pointer is properly aligned
    {
        num1 = (bp[2] << 16) | (bp[1] << 8) | bp[0];
        intBuffer[num1]++;
        bp += 3;
    }

    uint64_t* ip = (uint64_t*)bp;
    while ((uint8_t*)(ip + 2) < bytes + size_tot)
    {
        num1 = *ip++;
        num2 = *ip++;
        num3 = *ip++;

        intBuffer[num1 & 0xFFFFFF]++;
        intBuffer[(num1 >> 24) & 0xFFFFFF]++;
        intBuffer[(num1 >> 48) | ((num2 & 0xFF) << 16)]++;
        intBuffer[(num2 >> 8) & 0xFFFFFF]++;
        intBuffer[(num2 >> 32) & 0xFFFFFF]++;
        intBuffer[(num2 >> 56) | ((num3 & 0xFFFF) << 8)]++;
        intBuffer[(num3 >> 16) & 0xFFFFFF]++;
        intBuffer[num3 >> 40]++;
    }

    bp = (uint8_t*)ip;
    while (bp < bytes + size_tot)
    {
        num1 = (bp[2] << 16) | (bp[1] << 8) | bp[0];
        intBuffer[num1]++;
        bp += 3;
    }
}

您可以检查Compiler Explorer上的编译器输出。当然，智能编译器可能已经知道如何做到这一点，但大多数情况下都没有。正如您从Godbolt链接中看到的那样，compilers will use a bunch of movzx to read the separate bytes而不是读取整个寄存器。 ICC将进行更多的循环展开，但Clang和GCC不会

类似地，对于32位体系结构，您还将在每次迭代时读取3个“单词”。此外，您可能需要执行一些手动循环展开，而不是依赖编译器来执行此操作。 Here's an example on 32-bit little endian machines。这可以很容易adapted for big endian

intBuffer[num1 >> 8]++;
intBuffer[((num1 & 0xFF) << 16) | (num2 >> 16)]++;
intBuffer[((num2 & 0xFFFF) << 8) | (num3 >> 24)]++;
intBuffer[num3 & 0xFFFFFF]++;

但是为了获得更高的性能，您可能希望找到像SSE或AVX这样的SIMD解决方案

将字节转换为unsigned int

4 个答案: