Question

我有一个unsigned char类型的大数组（大约1 MB）（即uint8_t）。我知道它中的字节只能有5个值中的一个（即0,1,2,3,4）。此外，我们不需要保留输入中的'3'，当我们编码/解码时它们可以安全地丢失。

所以我猜测位压缩是压缩它的最简单方法，所以每个字节都可以转换为2位（00，01 ...，11）。

如上所述，可以删除值3的所有元素（即保存为0）。这让我可以选择将'4'保存为'3'。在重建（解压缩）时，我将3还原为4。

我为压缩编写了一个小函数，但我觉得这个操作太多而且效率不够高。关于如何使其更高效或更快（希望保持可读性）的任何代码片段或建议将非常有用。

/// Compress by packing four input bytes into one output byte, 2 bits each.
///
/// Value mapping applied to every byte that is loaded: 3 becomes 0
/// (discarded), 4 becomes 3, everything else is kept and then masked to its
/// low two bits. The first slot of a group lands in bits 7..6 of the output
/// byte, the fourth slot in bits 1..0.
///
/// out    - destination; one byte is written per complete group of four
/// in     - source bytes
/// length - number of source bytes; only length/4 complete groups are
///          processed, the trailing length % 4 bytes are not packed
void compressByPacking (uint8_t* out, uint8_t* in, uint32_t length)
{
    // One iteration per output byte; `in` advances by a whole group here.
    for (int loop = 0; loop < length/4; loop ++, in += 4, out++)
    {
      uint8_t temp[4];

      for (int small_loop = 0; small_loop < 4; small_loop++)
      {
        // NOTE(review): `in` is not indexed or advanced inside this loop, so
        // all four slots receive the first byte of the group.
        temp[small_loop] = *in;           // Load into local variable

        if (temp[small_loop] == 3)        // 3's are discarded
          temp[small_loop] = 0;
        else if (temp[small_loop] == 4)   // and 4's are converted to 3
          temp[small_loop] = 3;

      } // end small loop

      // Pack the bits into write pointer
      // (the uint8_t cast binds to the first term only; the OR is done in int
      // and narrowed again by the assignment)
      *out = (uint8_t)((temp[0] & 0x03) << 6) |
                      ((temp[1] & 0x03) << 4) |
                      ((temp[2] & 0x03) << 2) |
                      ((temp[3] & 0x03));

    } // end loop
 }

编辑使问题更加清晰，因为我试图将5个值保存为2位。感谢@Brian Cain建议的措辞。
Cross-posted on Code Review.

Answer 1

你的函数有一个错误：加载小数组时，你应该写：

    temp[small_loop] = in[small_loop];

您可以使用查找表删除测试，无论是在源数据上，还是在某些中间结果上更有效：

在下面的代码中，我使用一个小表lookup5将值0,1,2,3,4转换为0,1,2,0,3，并使用一个较大的表，将源数组中每组4个3位值直接映射为打包格式中对应的字节值：

#include <stdint.h>

/// Compress by packing four source bytes into one output byte via a
/// 4096-entry lookup table.
///
/// Each source byte is turned into a 2-bit code: 0, 1, 2 are kept, 3 becomes
/// 0 (dropped) and 4 becomes 3. Inside an output byte the first source byte
/// of a group occupies bits 7..6 and the fourth occupies bits 1..0. A
/// trailing partial group (length % 4 bytes) is packed with the same layout
/// and its unused low bits are left zero.
///
/// out    - destination; (length + 3) / 4 bytes are written
/// in     - source bytes; values above 7 would index outside the tables
/// length - number of source bytes
///
/// The table is built on the first call and guarded by a plain static flag,
/// so the first call is not safe to run concurrently from several threads.
void compressByPacking0(uint8_t *out, uint8_t *in, uint32_t length) {
    static uint8_t lookup[4096];
    static int lookup_ready = 0;
    static const uint8_t lookup5[8] = { 0, 1, 2, 0, 3, 0, 0, 0 };

    /* A dedicated flag is required: lookup[0] is legitimately 0 after
     * initialization, so it cannot double as the "not built yet" marker. */
    if (!lookup_ready) {
        /* index = four 3-bit source values, first value in the top bits */
        for (int i = 0; i < 4096; i++) {
            lookup[i] = (lookup5[(i >> 0) & 7] << 0) +
                        (lookup5[(i >> 3) & 7] << 2) +
                        (lookup5[(i >> 6) & 7] << 4) +
                        (lookup5[(i >> 9) & 7] << 6);
        }
        lookup_ready = 1;
    }
    for (; length >= 4; length -= 4, in += 4, out++) {
         *out = lookup[(in[0] << 9) + (in[1] << 6) + (in[2] << 3) + (in[3] << 0)];
    }
    /* Tail: same bit layout as a full group (first byte in bits 7..6). */
    uint8_t last = 0;
    switch (length) {
      case 3:
        last |= lookup5[in[2]] << 2;
        /* fall through */
      case 2:
        last |= lookup5[in[1]] << 4;
        /* fall through */
      case 1:
        last |= lookup5[in[0]] << 6;
        *out = last;
        break;
      default:
        /* length was a multiple of 4: nothing left to write */
        break;
    }
}

注意：

代码假定数组不包含指定范围之外的值。可以以最低的成本实现对伪输入的额外保护。
多余的 << 0 仅为保持写法对称，编译后不会产生任何额外代码。
可以通过构建时脚本或一组宏静态初始化查找表。
您可能希望将此循环展开4次或更多次，或让编译器决定。

您还可以使用这种更简单的解决方案，更频繁地访问较小的查找表。仔细的基准测试将告诉您哪个目标系统更有效：

/// Pack source bytes four to an output byte using a small per-slot table
/// whose entries are already shifted into their final bit position.
///
/// Value mapping: 0, 1, 2 kept; 3 -> 0; 4 -> 3. Slot 0 of a group ends up in
/// bits 7..6, slot 3 in bits 1..0. A trailing partial group is written as one
/// extra byte with the missing slots left zero.
///
/// out    - destination; (length + 3) / 4 bytes are written
/// in     - source bytes; values above 4 would index outside the table rows
/// length - number of source bytes
void compressByPacking1(uint8_t *out, uint8_t *in, uint32_t length) {
    /* placed[slot][value] = 2-bit code of value, moved to that slot */
    static const uint8_t placed[4][5] = {
        { 0x00, 0x40, 0x80, 0x00, 0xC0 },
        { 0x00, 0x10, 0x20, 0x00, 0x30 },
        { 0x00, 0x04, 0x08, 0x00, 0x0C },
        { 0x00, 0x01, 0x02, 0x00, 0x03 },
    };
    uint32_t remaining = length;

    /* complete groups of four */
    while (remaining >= 4) {
        *out++ = (uint8_t)(placed[0][in[0]] | placed[1][in[1]] |
                           placed[2][in[2]] | placed[3][in[3]]);
        in += 4;
        remaining -= 4;
    }

    /* 1..3 leftover bytes share one final output byte */
    if (remaining > 0) {
        uint8_t tail = 0;
        for (uint32_t slot = 0; slot < remaining; slot++) {
            tail |= placed[slot][in[slot]];
        }
        *out = tail;
    }
}

这是另一种方法，没有任何表格：

/// Pack source bytes four to an output byte without any lookup table.
///
/// BITS holds the 2-bit code of every source value v at bit offset 2*v
/// (0 -> 0, 1 -> 1, 2 -> 2, 3 -> 0, 4 -> 3). Shifting BITS right by 2*v and
/// left by the slot offset, then masking, yields the code already in place.
/// Slot 0 of a group ends up in bits 7..6, slot 3 in bits 1..0; a trailing
/// partial group is written as one extra byte with the missing slots zero.
///
/// out    - destination; (length + 3) / 4 bytes are written
/// in     - source bytes, expected in 0..4
/// length - number of source bytes
void compressByPacking2(uint8_t *out, uint8_t *in, uint32_t length) {
#define BITS ((1 << 2) + (2 << 4) + (3 << 8))
    uint32_t remaining = length;

    /* complete groups of four */
    while (remaining >= 4) {
        int packed = ((BITS << 6) >> (2 * in[0])) & 0xC0;
        packed |= ((BITS << 4) >> (2 * in[1])) & 0x30;
        packed |= ((BITS << 2) >> (2 * in[2])) & 0x0C;
        packed |= ((BITS << 0) >> (2 * in[3])) & 0x03;
        *out++ = (uint8_t)packed;
        in += 4;
        remaining -= 4;
    }

    /* 1..3 leftover bytes share one final output byte */
    if (remaining > 0) {
        int tail = ((BITS << 6) >> (2 * in[0])) & 0xC0;
        if (remaining > 1) {
            tail |= ((BITS << 4) >> (2 * in[1])) & 0x30;
        }
        if (remaining > 2) {
            tail |= ((BITS << 2) >> (2 * in[2])) & 0x0C;
        }
        *out = (uint8_t)tail;
    }
}

以下是我的系统的比较基准测试，Macbook pro运行OS / X，clang -O2：

compressByPacking(1MB) -> 0.867ms
compressByPacking0(1MB) -> 0.445ms
compressByPacking1(1MB) -> 0.538ms
compressByPacking2(1MB) -> 0.824ms

compressByPacking0这个变体速度最快，几乎是你的代码的两倍。提升幅度有点令人失望，但代码是可移植的。您可以使用手写的SSE优化来进一步提高性能。

Answer 2

我有一个大数组（大约1 MB）

要么这是一个笔误，要么你的目标平台已经非常老旧，要么这个压缩操作在你应用程序的关键路径中被反复调用。

有关如何提高效率的任何代码段或建议更快（希望保持可读性）将非常有用。

通常，您可以通过实际测量性能和检查生成的代码来找到最佳信息。使用分析器确定正在执行的代码，缓存未命中和管道停顿的位置 - 这些可以帮助您调整算法。

例如，您选择了4个元素的步幅。这只是因为您将四个输入元素映射到一个字节？您是否可以使用本机SIMD指令/内在函数一次操作更多元素？

另外，您如何编译目标以及编译器如何优化代码？

让我们问clang是否在尝试优化代码时遇到任何问题：

$ clang -fvectorize  -O3  -Rpass-missed=licm -c tryme.c 
tryme.c:11:28: remark: failed to move load with loop-invariant address because the loop may invalidate its value [-Rpass-missed=licm]
        temp[small_loop] = *in;           // Load into local variable
                           ^
tryme.c:21:25: remark: failed to move load with loop-invariant address because the loop may invalidate its value [-Rpass-missed=licm]
      *out = (uint8_t)((temp[0] & 0x03) << 6) |
                        ^
tryme.c:22:25: remark: failed to move load with loop-invariant address because the loop may invalidate its value [-Rpass-missed=licm]
                      ((temp[1] & 0x03) << 4) |
                        ^
tryme.c:23:25: remark: failed to move load with loop-invariant address because the loop may invalidate its value [-Rpass-missed=licm]
                      ((temp[2] & 0x03) << 2) |
                        ^
tryme.c:24:25: remark: failed to move load with loop-invariant address because the loop may invalidate its value [-Rpass-missed=licm]
                      ((temp[3] & 0x03));
                        ^

我不确定，但也许是别名分析让编译器认为它无法移动这次加载。可以尝试使用__restrict__，看看是否有帮助。

$ clang -fvectorize  -O3  -Rpass-analysis=loop-vectorize  -c tryme.c 
tryme.c:13:13: remark: loop not vectorized: loop contains a switch statement [-Rpass-analysis=loop-vectorize]
        if (temp[small_loop] == 3)        // 3's are discarded

除非你改变你的算法，否则我无法想到你能做些什么。如果压缩率令人满意而不删除3，则可以消除此问题。

那么生成的代码是什么样的？请看下面。如果手写，你能怎样写得更好？如果您自己能写得更好，可以直接手写，或者把这些改进反馈到算法中，以帮助引导编译器。

编译的代码是否利用了目标的指令集和寄存器？

最重要的是 - 尝试执行它，看看时间主要花在哪里。是分支预测错误，还是未对齐加载造成的停顿？也许你可以针对这些做些什么。利用您对输入数据中各值出现频率的了解，为编译器提供有关分支的提示。

$ objdump -d --source tryme.o
...
0000000000000000 <compressByPacking>:
#include <stdint.h>

void compressByPacking (uint8_t* out, uint8_t* in, uint32_t length)
{
    for (int loop = 0; loop < length/4; loop ++, in += 4, out++)
   0:   c1 ea 02                shr    $0x2,%edx
   3:   0f 84 86 00 00 00       je     8f <compressByPacking+0x8f>
   9:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)
    {
      uint8_t temp[4];

      for (int small_loop = 0; small_loop < 4; small_loop++)
      {
        temp[small_loop] = *in;           // Load into local variable
  10:   8a 06                   mov    (%rsi),%al

        if (temp[small_loop] == 3)        // 3's are discarded
  12:   3c 04                   cmp    $0x4,%al
  14:   74 3a                   je     50 <compressByPacking+0x50>
  16:   3c 03                   cmp    $0x3,%al
  18:   41 88 c0                mov    %al,%r8b
  1b:   75 03                   jne    20 <compressByPacking+0x20>
  1d:   45 31 c0                xor    %r8d,%r8d
  20:   3c 04                   cmp    $0x4,%al
  22:   74 33                   je     57 <compressByPacking+0x57>
  24:   3c 03                   cmp    $0x3,%al
  26:   88 c1                   mov    %al,%cl
  28:   75 02                   jne    2c <compressByPacking+0x2c>
  2a:   31 c9                   xor    %ecx,%ecx
  2c:   3c 04                   cmp    $0x4,%al
  2e:   74 2d                   je     5d <compressByPacking+0x5d>
  30:   3c 03                   cmp    $0x3,%al
  32:   41 88 c1                mov    %al,%r9b
  35:   75 03                   jne    3a <compressByPacking+0x3a>
  37:   45 31 c9                xor    %r9d,%r9d
  3a:   3c 04                   cmp    $0x4,%al
  3c:   74 26                   je     64 <compressByPacking+0x64>
  3e:   3c 03                   cmp    $0x3,%al
  40:   75 24                   jne    66 <compressByPacking+0x66>
  42:   31 c0                   xor    %eax,%eax
  44:   eb 20                   jmp    66 <compressByPacking+0x66>
  46:   66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
  4d:   00 00 00 
  50:   41 b0 03                mov    $0x3,%r8b
  53:   3c 04                   cmp    $0x4,%al
  55:   75 cd                   jne    24 <compressByPacking+0x24>
  57:   b1 03                   mov    $0x3,%cl
  59:   3c 04                   cmp    $0x4,%al
  5b:   75 d3                   jne    30 <compressByPacking+0x30>
  5d:   41 b1 03                mov    $0x3,%r9b
  60:   3c 04                   cmp    $0x4,%al
  62:   75 da                   jne    3e <compressByPacking+0x3e>
  64:   b0 03                   mov    $0x3,%al
          temp[small_loop] = 3;

      } // end small loop

      // Pack the bits into write pointer
      *out = (uint8_t)((temp[0] & 0x03) << 6) |
  66:   41 c0 e0 06             shl    $0x6,%r8b
                      ((temp[1] & 0x03) << 4) |
  6a:   c0 e1 04                shl    $0x4,%cl
  6d:   80 e1 30                and    $0x30,%cl
          temp[small_loop] = 3;

      } // end small loop

      // Pack the bits into write pointer
      *out = (uint8_t)((temp[0] & 0x03) << 6) |
  70:   44 08 c1                or     %r8b,%cl
                      ((temp[1] & 0x03) << 4) |
                      ((temp[2] & 0x03) << 2) |
  73:   41 c0 e1 02             shl    $0x2,%r9b
  77:   41 80 e1 0c             and    $0xc,%r9b
                      ((temp[3] & 0x03));
  7b:   24 03                   and    $0x3,%al

      } // end small loop

      // Pack the bits into write pointer
      *out = (uint8_t)((temp[0] & 0x03) << 6) |
                      ((temp[1] & 0x03) << 4) |
  7d:   44 08 c8                or     %r9b,%al
                      ((temp[2] & 0x03) << 2) |
  80:   08 c8                   or     %cl,%al
          temp[small_loop] = 3;

      } // end small loop

      // Pack the bits into write pointer
      *out = (uint8_t)((temp[0] & 0x03) << 6) |
  82:   88 07                   mov    %al,(%rdi)
#include <stdint.h>

void compressByPacking (uint8_t* out, uint8_t* in, uint32_t length)
{
    for (int loop = 0; loop < length/4; loop ++, in += 4, out++)
  84:   48 83 c6 04             add    $0x4,%rsi
  88:   48 ff c7                inc    %rdi
  8b:   ff ca                   dec    %edx
  8d:   75 81                   jne    10 <compressByPacking+0x10>
                      ((temp[1] & 0x03) << 4) |
                      ((temp[2] & 0x03) << 2) |
                      ((temp[3] & 0x03));

    } // end loop
 }
  8f:   c3                      retq

Answer 3

在大家热衷于讨论性能的同时，功能的正确性被忽略了。代码本身是有错误的。

    // temp[small_loop] = *in;           // Load into local variable
    temp[small_loop] = in[small_loop];

替代方案：

一个简单的紧密循环怎么样？

使用const和restrict进行各种优化。

/*
 * Pack source bytes four to an output byte with a simple tight loop.
 *
 * Value mapping: 0, 1, 2 kept; 3 -> 0; 4 -> 3. The first byte of a group
 * ends up in bits 7..6 of the output byte, the fourth in bits 1..0. A
 * trailing partial group (length & 3 bytes) is written as one extra byte
 * whose missing slots are zero.
 *
 * out    - destination; (length + 3) / 4 bytes are written
 * in     - source bytes; values above 4 would index outside the table
 * length - number of source bytes
 *
 * out and in are restrict-qualified, so the two buffers must not overlap.
 */
void compressByPacking1(uint8_t* restrict out, const uint8_t* restrict in,
    uint32_t length) {
  static const uint8_t t[5] = { 0, 1, 2, 0, 3 };
  uint32_t length4 = length / 4;
  uint32_t rest = length & 3;
  uint32_t i;

  for (i = 0; i < length4; i++) {
    unsigned v = 0;
    for (unsigned j = 0; j < 4; j++) {
      v = (v << 2) | t[*in++];
    }
    out[i] = (uint8_t) v;
  }

  if (rest) {
    unsigned v = 0;
    /* j must start at 0: every one of the four slots is shifted in, but
     * only the first `rest` slots consume a source byte. */
    for (unsigned j = 0; j < 4; j++) {
      v <<= 2;
      if (j < rest) {
        v |= t[*in++];
      }
    }
    out[i] = (uint8_t) v;
  }
}

经过测试，发现此代码的速度约为270％（41 vs 15）（YMMV）。
经测试并发现与OP（已更正）代码相同的输出

Answer 4

更新：已测试

不安全版本是最快的 - 比另一个答案中最快的版本还要快。使用VS2017测试。

/*
 * table[pos][value]: the 2-bit code of a source value (0, 1, 2 kept;
 * 3 -> 0; 4 -> 3) already shifted to bit offset 2 * pos, so that pos 0
 * fills bits 1..0 of a packed byte and pos 3 fills bits 7..6.
 */
const uint8_t table[4][5] = {
    /* value:   0     1     2     3     4  */
    /* pos 0 */ { 0x00, 0x01, 0x02, 0x00, 0x03 },
    /* pos 1 */ { 0x00, 0x04, 0x08, 0x00, 0x0C },
    /* pos 2 */ { 0x00, 0x10, 0x20, 0x00, 0x30 },
    /* pos 3 */ { 0x00, 0x40, 0x80, 0x00, 0xC0 },
};



/*
 * Pack len source bytes into 2-bit codes, four per output byte, using the
 * file-level `table`. Source byte i goes to out[i / 4] at position i & 3
 * (position 0 is the lowest bit pair). Each source byte is reduced modulo 5
 * so the table row index always stays in range.
 *
 * in  - source bytes
 * out - destination; ceil(len / 4) bytes are cleared and then filled
 * len - number of source bytes
 *
 * Uses memset, which is declared in <string.h>.
 */
void code(uint8_t *in, uint8_t *out, uint32_t len)
{
    /* Clear exactly the bytes that will be OR-ed into: ceil(len / 4).
     * Written without len + 3 so it cannot wrap for very large len. */
    uint32_t out_len = len / 4 + (len % 4 != 0);

    memset(out, 0, out_len);
    for (uint32_t i = 0; i < len; i++)
        out[i / 4] |= table[i & 3][in[i] % 5];
}

/*
 * Unchecked fast path: packs the source four bytes at a time using the
 * file-level `table`, with no range reduction of the source values and no
 * special handling of a partial last group.
 *
 * in  - source bytes; always read in whole groups of four, so when len is
 *       not a multiple of 4 the last group reads past in[len - 1]
 * out - destination; one byte is written per group
 * len - number of source bytes
 */
void code_unsafe(uint8_t *in, uint8_t *out, uint32_t len)
{
    uint32_t done = 0;

    while (done < len)
    {
        /* first byte of the group lands in the lowest bit pair */
        uint8_t packed = table[0][in[0]];
        packed |= table[1][in[1]];
        packed |= table[2][in[2]];
        packed |= table[3][in[3]];
        *out = packed;

        out++;
        in += 4;
        done += 4;
    }
}

要检查它是如何编写的，编译它就足够了 - 甚至在线

https://godbolt.org/g/Z75NQV

我的编码函数非常简单 - 仅用于比较编译器生成的代码，而不是测试。

Answer 5

这看起来更清楚吗？

chol2inv()

使用C中的位打包压缩'char'数组

5 个答案: