我有一个unsigned char
类型的大数组(大约1 MB)(即uint8_t
)。我知道它中的字节只能有5个值中的一个(即0,1,2,3,4)。此外,我们不需要保留输入中的'3',当我们编码/解码时它们可以安全地丢失。
所以我猜测位压缩是压缩它的最简单方法,所以每个字节都可以转换为2位(00
,01
...,11
)。
如上所述,可以删除值3的所有元素(即保存为0)。这让我可以选择将'4'保存为'3'。在重建(解压缩)时,我将3还原为4。
我为压缩编写了一个小函数,但我觉得这个操作太多而且效率不够高。关于如何使其更高效或更快(希望保持可读性)的任何代码片段或建议将非常有用。
/// Compress by packing four input symbols into each output byte.
///
/// Every input byte is expected to be one of 0, 1, 2, 3, 4. Symbol 3 is
/// deliberately lossy: it is stored as 0. Symbol 4 is stored as the 2-bit
/// code 3, so every symbol fits in two bits. Inside an output byte the first
/// symbol of a group occupies the two most significant bits and the fourth
/// symbol the two least significant bits.
///
/// Only complete groups of four are packed: exactly length / 4 bytes are
/// written to out, and the trailing length % 4 input bytes are ignored.
void compressByPacking (uint8_t* out, uint8_t* in, uint32_t length)
{
    const uint32_t groups = length / 4;

    for (uint32_t loop = 0; loop < groups; loop++, in += 4, out++)
    {
        uint8_t temp[4];

        for (int small_loop = 0; small_loop < 4; small_loop++)
        {
            // Load the small_loop-th symbol of the current group. Reading
            // *in here would pack the first symbol of the group four times.
            temp[small_loop] = in[small_loop];

            if (temp[small_loop] == 3)        // 3's are discarded
                temp[small_loop] = 0;
            else if (temp[small_loop] == 4)   // and 4's are converted to 3
                temp[small_loop] = 3;
        }

        // Pack the four 2-bit codes into the output byte, first symbol on top.
        *out = (uint8_t)(((temp[0] & 0x03) << 6) |
                         ((temp[1] & 0x03) << 4) |
                         ((temp[2] & 0x03) << 2) |
                         ((temp[3] & 0x03)));
    }
}
答案 0 :(得分:2)
你的函数有一个错误:加载小数组时,你应该写:
temp[small_loop] = in[small_loop];
您可以使用查找表来消除这些条件判断:既可以直接对源数据查表,也可以更高效地对某个中间结果查表:
在下面的代码中,我使用一个小表lookup5
将值0,1,2,3,4
转换为0,1,2,0,3
,并使用一个较大的表,把源数组中每4个3位值组成的一组映射为打包格式中对应的字节值:
#include <stdint.h>
/// Compress by packing four input symbols into each output byte, using a
/// 4096-entry table indexed by four 3-bit symbols at once.
///
/// Input bytes must be in the range 0..4 (values above 7 would index outside
/// the tables). Symbol 3 is stored as 0 and symbol 4 as the 2-bit code 3.
/// The first symbol of each group lands in the two most significant bits of
/// the output byte. A trailing partial group of 1..3 symbols is packed the
/// same way into one extra byte whose unused low bits are zero, so
/// (length + 3) / 4 bytes are written in total.
///
/// The big table is built lazily on the first call. The build is not
/// synchronized, so the first call must not race with another call.
void compressByPacking0(uint8_t *out, uint8_t *in, uint32_t length) {
    static uint8_t lookup[4096];
    /* Explicit flag: lookup[0] is legitimately 0, so it cannot double as an
       "already initialized" marker without rebuilding the table every call. */
    static int lookup_ready = 0;
    static const uint8_t lookup5[8] = { 0, 1, 2, 0, 3, 0, 0, 0 };

    if (!lookup_ready) {
        /* initialize lookup table */
        for (int i = 0; i < 4096; i++) {
            lookup[i] = (uint8_t)((lookup5[(i >> 0) & 7] << 0) +
                                  (lookup5[(i >> 3) & 7] << 2) +
                                  (lookup5[(i >> 6) & 7] << 4) +
                                  (lookup5[(i >> 9) & 7] << 6));
        }
        lookup_ready = 1;
    }

    for (; length >= 4; length -= 4, in += 4, out++) {
        *out = lookup[(in[0] << 9) + (in[1] << 6) + (in[2] << 3) + (in[3] << 0)];
    }

    /* Partial last group: keep the bit order of the main loop (first symbol
       in the top two bits) so a decoder needs no special case for the tail. */
    uint8_t last = 0;
    switch (length) {
    case 3:
        last |= (uint8_t)(lookup5[in[2]] << 2);
        /* fall through */
    case 2:
        last |= (uint8_t)(lookup5[in[1]] << 4);
        /* fall through */
    case 1:
        last |= (uint8_t)(lookup5[in[0]] << 6);
        *out = last;
        break;
    default:
        /* length was a multiple of 4: nothing left to write */
        break;
    }
}
注意:
代码假定数组不包含指定范围之外的值。只需很小的代价即可增加对非法输入的额外保护。
多余的<< 0
仅为保持对称,编译后不会产生任何额外代码。
可以通过构建时脚本或一组宏静态初始化查找表。
您可能希望将此循环展开4次或更多次,或让编译器决定。
您还可以使用这种更简单的解决方案,更频繁地访问较小的查找表。仔细的基准测试将告诉您哪个目标系统更有效:
/// Compress by packing four input symbols into each output byte, using one
/// small pre-shifted table per position inside the byte.
///
/// Input bytes must be in the range 0..4. Symbol 3 is stored as 0 and
/// symbol 4 as the 2-bit code 3. The first symbol of a group ends up in the
/// two most significant bits. A trailing partial group of 1..3 symbols is
/// written as one extra byte with its unused low bits left at zero.
void compressByPacking1(uint8_t *out, uint8_t *in, uint32_t length) {
    /* shifted[slot][symbol] is the 2-bit code of symbol, already moved to
       the bit position of slot (slot 0 = bits 7..6, slot 3 = bits 1..0). */
    static const uint8_t shifted[4][5] = {
        { 0x00, 0x40, 0x80, 0x00, 0xC0 },
        { 0x00, 0x10, 0x20, 0x00, 0x30 },
        { 0x00, 0x04, 0x08, 0x00, 0x0C },
        { 0x00, 0x01, 0x02, 0x00, 0x03 },
    };
    uint32_t full_groups = length / 4;
    const uint32_t leftover = length % 4;

    while (full_groups-- > 0) {
        *out++ = shifted[0][in[0]] + shifted[1][in[1]] +
                 shifted[2][in[2]] + shifted[3][in[3]];
        in += 4;
    }

    if (leftover != 0) {
        uint8_t tail = 0;
        for (uint32_t slot = 0; slot < leftover; slot++) {
            tail |= shifted[slot][in[slot]];
        }
        *out = tail;
    }
}
这是另一种方法,没有任何表格:
/// Compress by packing four input symbols into each output byte without any
/// lookup table: the five 2-bit codes live in one integer constant and are
/// selected by shifting.
///
/// Input bytes must be in the range 0..4. Symbol 3 is stored as 0 and
/// symbol 4 as the 2-bit code 3. The first symbol of a group ends up in the
/// two most significant bits. A trailing partial group of 1..3 symbols is
/// written as one extra byte with its unused low bits left at zero.
void compressByPacking2(uint8_t *out, uint8_t *in, uint32_t length) {
/* Bits (2*s+1 .. 2*s) of BITS hold the code for symbol s: 0, 1, 2, 0, 3. */
#define BITS ((1 << 2) + (2 << 4) + (3 << 8))
    const uint32_t quads = length / 4;
    const uint32_t rest = length % 4;

    for (uint32_t q = 0; q < quads; q++) {
        /* Pre-shift the constant to the target slot, then shift right by
           twice the symbol so that the symbol's code lines up with the mask. */
        const int slot0 = (BITS << 6 >> (2 * in[0])) & 0xC0;
        const int slot1 = (BITS << 4 >> (2 * in[1])) & 0x30;
        const int slot2 = (BITS << 2 >> (2 * in[2])) & 0x0C;
        const int slot3 = (BITS << 0 >> (2 * in[3])) & 0x03;
        *out = slot0 + slot1 + slot2 + slot3;
        in += 4;
        out++;
    }

    if (rest != 0) {
        uint8_t tail = 0;
        if (rest > 2) {
            tail |= (BITS << 2 >> (2 * in[2])) & 0x0C;
        }
        if (rest > 1) {
            tail |= (BITS << 4 >> (2 * in[1])) & 0x30;
        }
        tail |= (BITS << 6 >> (2 * in[0])) & 0xC0;
        *out = tail;
    }
}
以下是我的系统的比较基准测试,Macbook pro运行OS / X,clang -O2
:
compressByPacking(1MB) -> 0.867ms
compressByPacking0(1MB) -> 0.445ms
compressByPacking1(1MB) -> 0.538ms
compressByPacking2(1MB) -> 0.824ms
compressByPacking0
变种速度最快,几乎是您原始代码的两倍。
这有点令人失望,但代码是可移植的。您可以使用手动编码的SSE优化来提高性能。
答案 1 :(得分:1)
我有一个大阵列(大约1 MB)
要么这是个笔误,要么你的目标平台已经相当老旧,要么这个压缩操作在应用程序的关键路径上被反复调用。
有关如何提高效率的任何代码段或建议 更快(希望保持可读性)将非常有用。
通常,您可以通过实际测量性能和检查生成的代码来找到最佳信息。使用分析器确定正在执行的代码,缓存未命中和管道停顿的位置 - 这些可以帮助您调整算法。
例如,您选择了4个元素的步幅。这只是因为您将四个输入元素映射到一个字节?您是否可以使用本机SIMD指令/内在函数一次操作更多元素?
另外,您如何编译目标以及编译器如何优化代码?
让我们问clang
是否在尝试优化代码时遇到任何问题:
$ clang -fvectorize -O3 -Rpass-missed=licm -c tryme.c
tryme.c:11:28: remark: failed to move load with loop-invariant address because the loop may invalidate its value [-Rpass-missed=licm]
temp[small_loop] = *in; // Load into local variable
^
tryme.c:21:25: remark: failed to move load with loop-invariant address because the loop may invalidate its value [-Rpass-missed=licm]
*out = (uint8_t)((temp[0] & 0x03) << 6) |
^
tryme.c:22:25: remark: failed to move load with loop-invariant address because the loop may invalidate its value [-Rpass-missed=licm]
((temp[1] & 0x03) << 4) |
^
tryme.c:23:25: remark: failed to move load with loop-invariant address because the loop may invalidate its value [-Rpass-missed=licm]
((temp[2] & 0x03) << 2) |
^
tryme.c:24:25: remark: failed to move load with loop-invariant address because the loop may invalidate its value [-Rpass-missed=licm]
((temp[3] & 0x03));
^
我不确定,但也许正是别名分析让编译器认为它无法移动这些加载操作。尝试使用__restrict__
来查看是否有效。
$ clang -fvectorize -O3 -Rpass-analysis=loop-vectorize -c tryme.c
tryme.c:13:13: remark: loop not vectorized: loop contains a switch statement [-Rpass-analysis=loop-vectorize]
if (temp[small_loop] == 3) // 3's are discarded
除非你改变你的算法,否则我无法想到你能做些什么。如果压缩率令人满意而不删除3
,则可以消除此问题。
那么生成的代码是什么样的?请看下面。你自己手写能写得更好吗?如果可以,就手写它,或者把这些发现反馈到算法中,以帮助引导编译器。
编译的代码是否利用了目标的指令集和寄存器?
最重要的是 - 尝试执行它,看看你花费的时间最多。分支错误预测,未对齐的负载失速?也许你可以做些什么。使用您对输入数据频率的了解,为编译器提供有关分支的提示。
$ objdump -d --source tryme.o
...
0000000000000000 <compressByPacking>:
#include <stdint.h>
void compressByPacking (uint8_t* out, uint8_t* in, uint32_t length)
{
for (int loop = 0; loop < length/4; loop ++, in += 4, out++)
0: c1 ea 02 shr $0x2,%edx
3: 0f 84 86 00 00 00 je 8f <compressByPacking+0x8f>
9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
{
uint8_t temp[4];
for (int small_loop = 0; small_loop < 4; small_loop++)
{
temp[small_loop] = *in; // Load into local variable
10: 8a 06 mov (%rsi),%al
if (temp[small_loop] == 3) // 3's are discarded
12: 3c 04 cmp $0x4,%al
14: 74 3a je 50 <compressByPacking+0x50>
16: 3c 03 cmp $0x3,%al
18: 41 88 c0 mov %al,%r8b
1b: 75 03 jne 20 <compressByPacking+0x20>
1d: 45 31 c0 xor %r8d,%r8d
20: 3c 04 cmp $0x4,%al
22: 74 33 je 57 <compressByPacking+0x57>
24: 3c 03 cmp $0x3,%al
26: 88 c1 mov %al,%cl
28: 75 02 jne 2c <compressByPacking+0x2c>
2a: 31 c9 xor %ecx,%ecx
2c: 3c 04 cmp $0x4,%al
2e: 74 2d je 5d <compressByPacking+0x5d>
30: 3c 03 cmp $0x3,%al
32: 41 88 c1 mov %al,%r9b
35: 75 03 jne 3a <compressByPacking+0x3a>
37: 45 31 c9 xor %r9d,%r9d
3a: 3c 04 cmp $0x4,%al
3c: 74 26 je 64 <compressByPacking+0x64>
3e: 3c 03 cmp $0x3,%al
40: 75 24 jne 66 <compressByPacking+0x66>
42: 31 c0 xor %eax,%eax
44: eb 20 jmp 66 <compressByPacking+0x66>
46: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
4d: 00 00 00
50: 41 b0 03 mov $0x3,%r8b
53: 3c 04 cmp $0x4,%al
55: 75 cd jne 24 <compressByPacking+0x24>
57: b1 03 mov $0x3,%cl
59: 3c 04 cmp $0x4,%al
5b: 75 d3 jne 30 <compressByPacking+0x30>
5d: 41 b1 03 mov $0x3,%r9b
60: 3c 04 cmp $0x4,%al
62: 75 da jne 3e <compressByPacking+0x3e>
64: b0 03 mov $0x3,%al
temp[small_loop] = 3;
} // end small loop
// Pack the bits into write pointer
*out = (uint8_t)((temp[0] & 0x03) << 6) |
66: 41 c0 e0 06 shl $0x6,%r8b
((temp[1] & 0x03) << 4) |
6a: c0 e1 04 shl $0x4,%cl
6d: 80 e1 30 and $0x30,%cl
temp[small_loop] = 3;
} // end small loop
// Pack the bits into write pointer
*out = (uint8_t)((temp[0] & 0x03) << 6) |
70: 44 08 c1 or %r8b,%cl
((temp[1] & 0x03) << 4) |
((temp[2] & 0x03) << 2) |
73: 41 c0 e1 02 shl $0x2,%r9b
77: 41 80 e1 0c and $0xc,%r9b
((temp[3] & 0x03));
7b: 24 03 and $0x3,%al
} // end small loop
// Pack the bits into write pointer
*out = (uint8_t)((temp[0] & 0x03) << 6) |
((temp[1] & 0x03) << 4) |
7d: 44 08 c8 or %r9b,%al
((temp[2] & 0x03) << 2) |
80: 08 c8 or %cl,%al
temp[small_loop] = 3;
} // end small loop
// Pack the bits into write pointer
*out = (uint8_t)((temp[0] & 0x03) << 6) |
82: 88 07 mov %al,(%rdi)
#include <stdint.h>
void compressByPacking (uint8_t* out, uint8_t* in, uint32_t length)
{
for (int loop = 0; loop < length/4; loop ++, in += 4, out++)
84: 48 83 c6 04 add $0x4,%rsi
88: 48 ff c7 inc %rdi
8b: ff ca dec %edx
8d: 75 81 jne 10 <compressByPacking+0x10>
((temp[1] & 0x03) << 4) |
((temp[2] & 0x03) << 2) |
((temp[3] & 0x03));
} // end loop
}
8f: c3 retq
答案 2 :(得分:1)
大家都在兴奋地讨论性能,却忽略了功能本身:这段代码是有错误的。
// temp[small_loop] = *in; // Load into local variable
temp[small_loop] = in[small_loop];
替代方案:
一个简单的紧密循环怎么样?
使用const
和restrict
进行各种优化。
/* Pack symbols 0..4 from in into 2-bit codes, four per output byte.
 *
 * Symbol 3 is stored as 0 and symbol 4 as code 3. The first symbol of each
 * group of four ends up in the two most significant bits of the output byte.
 * If length is not a multiple of 4, the remaining 1..3 symbols are packed
 * into one extra byte whose unused low bits are zero, so (length + 3) / 4
 * bytes are written in total.
 *
 * Input bytes must be in the range 0..4; larger values index outside the
 * translation table. in and out must not overlap (restrict).
 */
void compressByPacking1(uint8_t* restrict out, const uint8_t* restrict in,
    uint32_t length) {
  static const uint8_t t[5] = { 0, 1, 2, 0, 3 };
  uint32_t length4 = length / 4;
  uint32_t tail = length & 3;
  uint32_t i;

  for (i = 0; i < length4; i++) {
    unsigned v = 0;
    for (unsigned j = 0; j < 4; j++) {
      v <<= 2;
      v |= t[*in++];
    }
    out[i] = (uint8_t) v;
  }

  if (tail) {
    unsigned v = 0;
    /* j must start at 0: leaving it uninitialized is undefined behaviour. */
    for (unsigned j = 0; j < 4; j++) {
      v <<= 2;                 /* keep shifting so missing slots stay zero */
      if (j < tail) {
        v |= t[*in++];
      }
    }
    out[i] = (uint8_t) v;
  }
}
经过测试,此代码的速度约为原代码的270%(41 vs 15)(结果可能因环境而异)。
经测试并发现与OP(已更正)代码相同的输出
答案 3 :(得分:-1)
更新:已测试
不安全版本是最快的——比其他答案中的各个版本都快。用VS2017测试
/* table[slot][symbol]: the 2-bit code of symbol (0, 1, 2 map to themselves,
 * 3 maps to 0, 4 maps to 3), already shifted to the bit position of slot.
 * Slot 0 is the two LEAST significant bits of the packed byte, slot 3 the
 * two most significant ones. */
const uint8_t table[4][5] = {
    /* slot 0, bits 1..0 */ { 0x00, 0x01, 0x02, 0x00, 0x03 },
    /* slot 1, bits 3..2 */ { 0x00, 0x04, 0x08, 0x00, 0x0C },
    /* slot 2, bits 5..4 */ { 0x00, 0x10, 0x20, 0x00, 0x30 },
    /* slot 3, bits 7..6 */ { 0x00, 0x40, 0x80, 0x00, 0xC0 },
};
/* Pack len symbols from in into out, four symbols per byte, using table.
 *
 * Each input byte is reduced modulo 5 before the lookup, so any byte value
 * is accepted. Symbol i of a group goes to the bit position given by
 * table[i & 3]. Exactly ceil(len / 4) bytes of out are written; a partial
 * last group leaves the bits of its missing symbols at zero.
 */
void code(uint8_t *in, uint8_t *out, uint32_t len)
{
    /* Clear only the bytes that will be produced. Clearing len / 4 + 1 bytes
       would write one byte past an exactly-sized buffer whenever len is a
       multiple of 4. Written as a sum to avoid overflow in (len + 3). */
    memset(out, 0, len / 4 + (len % 4 != 0));

    for (uint32_t i = 0; i < len; i++)
        out[i / 4] |= table[i & 3][in[i] % 5];
}
/* Pack len symbols from in into out, four symbols per byte, using table.
 *
 * "Unsafe" because the input values are not range-checked: every input byte
 * must already be in 0..4, otherwise the lookup indexes outside table.
 * Complete groups of four are packed with one store each. A trailing group
 * of 1..3 symbols is packed into one extra byte without reading past the
 * end of in, so ceil(len / 4) bytes are written.
 */
void code_unsafe(uint8_t *in, uint8_t *out, uint32_t len)
{
    const uint32_t rest = len % 4;

    /* Count whole groups instead of stepping an index by 4: that neither
       over-reads a partial last group nor wraps for len near UINT32_MAX. */
    for (uint32_t groups = len / 4; groups > 0; groups--, in += 4, out++)
    {
        *out = table[0][in[0]] | table[1][in[1]] | table[2][in[2]] | table[3][in[3]];
    }

    if (rest != 0)
    {
        uint8_t last = 0;
        for (uint32_t i = 0; i < rest; i++)
            last |= table[i][in[i]];
        *out = last;
    }
}
要查看它被编译成什么样的代码,只需编译一下即可——甚至可以使用在线编译器
我的编码函数非常简单 - 仅用于比较编译器生成的代码,而不是测试。
答案 4 :(得分:-2)
这看起来更清楚吗?
chol2inv()