Question

我有一个案例，我需要压缩很多通常很小的值。因此，我使用可变长度字节编码（ULEB128来压缩它们，具体而言）：

size_t
compress_unsigned_int(unsigned int n, char* data)
{
  size_t size = 0;
  while (n  > 127)
  {
    ++size;
    *data++ = (n & 127)|128;
    n >>= 7;
  }
  *data++ = n;
  return ++size;
}

是否有更有效的方法（可能使用SSE）？

编辑：经过此压缩后，结果将存储到data，并占用size个字节。然后，在下一个unsigned int上调用压缩函数。

Answer 1

您要做的第一件事是针对当前代码测试任何可能的解决方案。

我想你可能想尝试摆脱数据依赖，让处理器同时做更多的工作。

什么是数据依赖？当数据流过您的函数时，n的当前值取决于n的先前值，这取决于之前的值...这是一个长链数据依赖。在下面的代码中，n永远不会被修改，因此处理器可以“跳过”并同时执行几个不同的操作，而无需等待计算新的n。

// NOTE: This code is actually incorrect, as caf noted.
// The byte order is reversed.
size_t
compress_unsigned_int(unsigned int n, char *data)
{
    if (n < (1U << 14)) {
        if (n < (1U << 7)) {
            data[0] = n;
            return 1;
        } else {
            data[0] = (n >> 7) | 0x80;
            data[1] = n & 0x7f;
            return 2;
        }
    } else if (n < (1U << 28)) {
        if (n < (1U << 21)) {
            data[0] = (n >> 14) | 0x80;
            data[1] = ((n >> 7) & 0x7f) | 0x80;
            data[2] = n & 0x7f;
            return 3;
        } else {
            data[0] = (n >> 21) | 0x80;
            data[1] = ((n >> 14) & 0x7f) | 0x80;
            data[2] = ((n >> 7) & 0x7f) | 0x80;
            data[3] = n & 0x7f;
            return 4;
        }
    } else {
        data[0] = (n >> 28) | 0x80;
        data[1] = ((n >> 21) & 0x7f) | 0x80;
        data[2] = ((n >> 14) & 0x7f) | 0x80;
        data[3] = ((n >> 7) & 0x7f) | 0x80;
        data[4] = n & 0x7f;
        return 5;
    }
}

我通过在0..UINT_MAX的紧密循环中执行它来测试性能。在我的系统上，执行时间是：

(Lower is better)
Original: 100%
caf's unrolled version: 79%
My version: 57%

一些细微的调整可能会产生更好的结果，但我怀疑除非你去集会，否则你会得到更多的改善。如果您的整数倾向于在特定范围内，那么您可以使用分析来使编译器将正确的分支预测添加到每个分支。这可能会让你获得一些额外的百分点速度。（编辑：我从重新排序分支中得到8％，但这是一个反常的优化，因为它依赖于每个数字0 ... UINT_MAX以相同频率出现的事实。我不建议这样做。）

SSE无济于事。 SSE被设计为同时对具有相同宽度的多个数据进行操作，众所周知难以使SIMD通过可变长度编码来加速任何事物。（这不一定是不可能的，但可能是不可能的，你必须非常聪明才能弄明白。）

Answer 2

如果您的unsigned int值限制在特定范围（例如32位），则可以展开循环：

size_t
compress_unsigned_int(unsigned int n, char* data)
{
  size_t size;

  if (n < 0x00000080U) {
    size = 1;
    goto b1;
  }
  if (n < 0x00004000U) {
    size = 2;
    goto b2;
  }
  if (n < 0x00200000U) {
    size = 3;
    goto b3;
  }
  if (n < 0x10000000U) {
    size = 4;
    goto b4;
  }
  size = 5;

  *data++ = (n & 0x7f) | 0x80;
  n >>= 7;
b4:
  *data++ = (n & 0x7f) | 0x80;
  n >>= 7;
b3:
  *data++ = (n & 0x7f) | 0x80;
  n >>= 7;
b2:
  *data++ = (n & 0x7f) | 0x80;
  n >>= 7;
b1:
  *data = n;
  return size;
}

Answer 3

您可能会在Google协议缓冲区中找到快速实施：

http://code.google.com/p/protobuf/

查看CodedOutputStream :: WriteVarintXXX方法。

第一种方法可能会重写为：

char *start = data;
while (n>=0x80)
{
    *data++=(n|0x80);
    n>>=7;
}
*data++=n;
return data-start;

根据我的测试google缓冲区实现是最好的，然后来其他实现。然而，我的测试相当人为，最好在您的应用程序中测试每种方法并选择最佳方法。提出的优化在特定数值上更有效。

这是我的测试应用程序的代码。（请注意，我已从compress_unsigned_int_google_buf中删除了代码。您可以在以下文件中找到google buffer protocol中的实现：coded_stream.cc方法CodedOutputStream :: WriteVarint32FallbackToArrayInline）

size_t compress_unsigned_int(unsigned int n, char* data)
{
    size_t size = 0;
    while (n  > 127)
    {
        ++size;
        *data++ = (n & 127)|128;
        n >>= 7;
    }
    *data++ = n;
    return ++size;
}

size_t compress_unsigned_int_improved(unsigned int n, char* data)
{
    size_t size;

    if (n < 0x00000080U) {
        size = 1;
        goto b1;
    }
    if (n < 0x00004000U) {
        size = 2;
        goto b2;
    }
    if (n < 0x00200000U) {
        size = 3;
        goto b3;
    }
    if (n < 0x10000000U) {
        size = 4;
        goto b4;
    }
    size = 5;

    *data++ = (n & 0x7f) | 0x80;
    n >>= 7;
b4:
    *data++ = (n & 0x7f) | 0x80;
    n >>= 7;
b3:
    *data++ = (n & 0x7f) | 0x80;
    n >>= 7;
b2:
    *data++ = (n & 0x7f) | 0x80;
    n >>= 7;
b1:
    *data = n;
    return size;
}

size_t compress_unsigned_int_more_improved(unsigned int n, char *data)
{
    if (n < (1U << 14)) {
        if (n < (1U << 7)) {
            data[0] = n;
            return 1;
        } else {
            data[0] = (n >> 7) | 0x80;
            data[1] = n & 0x7f;
            return 2;
        }
    } else if (n < (1U << 28)) {
        if (n < (1U << 21)) {
            data[0] = (n >> 14) | 0x80;
            data[1] = ((n >> 7) & 0x7f) | 0x80;
            data[2] = n & 0x7f;
            return 3;
        } else {
            data[0] = (n >> 21) | 0x80;
            data[1] = ((n >> 14) & 0x7f) | 0x80;
            data[2] = ((n >> 7) & 0x7f) | 0x80;
            data[3] = n & 0x7f;
            return 4;
        }
    } else {
        data[0] = (n >> 28) | 0x80;
        data[1] = ((n >> 21) & 0x7f) | 0x80;
        data[2] = ((n >> 14) & 0x7f) | 0x80;
        data[3] = ((n >> 7) & 0x7f) | 0x80;
        data[4] = n & 0x7f;
        return 5;
    }
}

size_t compress_unsigned_int_simple(unsigned int n, char *data)
{
    char *start = data;
    while (n>=0x80)
    {
        *data++=(n|0x80);
        n>>=7;
    }
    *data++=n;
    return data-start;
}

inline size_t compress_unsigned_int_google_buf(unsigned int value, unsigned char* target) {

          // This implementation might be found in google protocol buffers

}



#include <iostream>
#include <Windows.h>
using namespace std;

int _tmain(int argc, _TCHAR* argv[])
{
    char data[20];
    unsigned char udata[20];
    size_t size = 0;
    __int64 timer;

    cout << "Plain copy: ";

    timer = GetTickCount64();

    size = 0;

    for (int i=0; i<536870900; i++)
    {
        memcpy(data,&i,sizeof(i));
        size += sizeof(i);
    }

    cout << GetTickCount64() - timer << " Size: " << size <<  endl;

    cout << "Original: ";

    timer = GetTickCount64();

    size = 0;

    for (int i=0; i<536870900; i++)
    {
        size += compress_unsigned_int(i,data);
    }

    cout << GetTickCount64() - timer << " Size: " << size << endl;

    cout << "Improved: ";

    timer = GetTickCount64();

    size = 0;

    for (int i=0; i<536870900; i++)
    {
        size += compress_unsigned_int_improved(i,data);
    }

    cout << GetTickCount64() - timer << " Size: " << size <<  endl;

    cout << "More Improved: ";

    timer = GetTickCount64();

    size = 0;

    for (int i=0; i<536870900; i++)
    {
        size += compress_unsigned_int_more_improved(i,data);
    }

    cout << GetTickCount64() - timer << " Size: " << size <<  endl;

    cout << "Simple: ";

    timer = GetTickCount64();

    size = 0;

    for (int i=0; i<536870900; i++)
    {
        size += compress_unsigned_int_simple(i,data);
    }

    cout << GetTickCount64() - timer << " Size: " << size <<  endl;

    cout << "Google Buffers: ";

    timer = GetTickCount64();

    size = 0;

    for (int i=0; i<536870900; i++)
    {
        size += compress_unsigned_int_google_buf(i,udata);
    }

    cout << GetTickCount64() - timer << " Size: " << size <<  endl;

    return 0;
}

在使用Visual C ++编译器的机器上，我得到了以下结果：

普通副本：358毫秒

原文：2497 ms

改进：2215毫秒

更多改进：2231毫秒

简单：2059毫秒

Google Buffers：968 ms

Answer 4

经过更多浏览后，我在Sqlite3中找到了另一个常用的实现（代码版本3070900）：

inline int sqlite3PutVarint(unsigned char *p, unsigned __int64 v){
  int i, j, n;
  unsigned char buf[10];
  if( v & (((unsigned __int64)0xff000000)<<32) ){
    p[8] = (unsigned char)v;
    v >>= 8;
    for(i=7; i>=0; i--){
      p[i] = (unsigned char)((v & 0x7f) | 0x80);
      v >>= 7;
    }
    return 9;
  }    
  n = 0;
  do{
    buf[n++] = (unsigned char)((v & 0x7f) | 0x80);
    v >>= 7;
  }while( v!=0 );
  buf[0] &= 0x7f;
  for(i=0, j=n-1; j>=0; j--, i++){
    p[i] = buf[j];
  }
  return n;
}

还有针对32位int的稍微优化的版本：

int sqlite3PutVarint32(unsigned char *p, unsigned int v){

  if( (v & ~0x7f)==0 ){
    p[0] = v;
    return 1;
  }

  if( (v & ~0x3fff)==0 ){
    p[0] = (unsigned char)((v>>7) | 0x80);
    p[1] = (unsigned char)(v & 0x7f);
    return 2;
  }
  return sqlite3PutVarint(p, v);
}

令人失望的是，Sqlite实现在我的测试中表现最差。因此，如果您打算使用Sqlite，请考虑使用优化的替换默认实现。

与此同时，我正在考虑进一步可能的优化。

Answer 5

您可以通过替换
来保存一些操作 size_t size=0;...++size;...;return size++;与
char* base=data;...;return data-base;

Answer 6

这是我在x86汇编语言（32位）中的优化。您可以使用NASM进行编译并进行链接。我不知道它是快还是慢，我只是很开心编码：）

global compress_unsigned_int

;   bit fields:
;   31                              0
;    eeeedddddddcccccccbbbbbbbaaaaaaa


compress_unsigned_int:
    mov     eax, [esp+4]    ; n
    mov     ecx, [esp+8]    ; data

    cmp     eax, 00001111111111111111111111111111b
    jbe     out4b

    shld    edx, eax, 11
    shl     eax, 10
    shld    edx, eax, 8
    shl     eax, 7
    shld    edx, eax, 8
    shl     eax, 7
    shld    edx, eax, 8
    or      edx, 10000000100000001000000010000000b

    mov     [ecx], edx
    mov     eax, [esp+4]
    shr     eax, 28
    mov     [ecx+4], al

    mov     eax, 5
    jmp     exit

out4b:
    cmp     eax, 00000000000111111111111111111111b
    jbe     out3b

    shld    edx, eax, 11
    shl     eax, 10
    shld    edx, eax, 8
    shl     eax, 7
    shld    edx, eax, 8
    shl     eax, 7
    shld    edx, eax, 8
    or      edx, 00000000100000001000000010000000b

    mov     [ecx], edx

    mov     eax, 4
    jmp     exit

out3b:
    cmp     eax, 00000000000000000011111111111111b
    jbe     out2b

    shld    edx, eax, 25
    shl     eax, 24
    shld    edx, eax, 8

    mov     eax, edx

    or      edx, 00000000000000001000000010000000b

    mov     [ecx], dx
    shr     eax, 15
    mov     [ecx+2], al

    mov     eax, 3
    jmp     exit

out2b:
    cmp     eax, 00000000000000000000000001111111b
    jbe     out1b

    shld    edx, eax, 25
    shl     eax, 24
    shld    edx, eax, 8
    or      edx, 00000000000000000000000010000000b

    mov     [ecx], dx

    mov     eax, 2
    jmp     exit

out1b:
    mov     [ecx], al

    mov     eax, 1

exit:
    ret

优化可变长度编码

6 个答案: