我有一个场景,需要压缩大量通常很小的值。因此,我使用可变长度字节编码(具体而言是ULEB128)来压缩它们:
/*
 * Encode n as a ULEB128 byte sequence at data.
 *
 * The value is emitted seven bits at a time, least-significant group first.
 * Every byte except the last has its high bit set to signal that more bytes
 * follow. Returns the number of bytes written.
 */
size_t
compress_unsigned_int(unsigned int n, char* data)
{
    char* cursor = data;

    /* While more than seven bits remain, emit a group with the continuation flag. */
    for (; n > 127; n >>= 7)
    {
        *cursor++ = (n & 127) | 128;
    }

    /* Last group: high bit clear marks the end of the value. */
    *cursor++ = n;
    return (size_t)(cursor - data);
}
是否有更有效的方法(可能使用SSE)?
编辑:压缩之后,结果存储在data中,并占用size个字节。然后,对下一个unsigned int调用压缩函数。
答案 0 :(得分:8)
您要做的第一件事是针对当前代码测试任何可能的解决方案。
我想你可能想尝试摆脱数据依赖,让处理器同时做更多的工作。
什么是数据依赖?当数据流过您的函数时,n的当前值取决于n的先前值,而先前值又取决于更早的值……这是一条很长的数据依赖链。在下面的代码中,n永远不会被修改,因此处理器可以“提前”同时执行几个不同的操作,而无需等待计算出新的n。
/*
 * Encode n as ULEB128 into data without a loop and without modifying n.
 *
 * Each output byte is computed directly from the unchanged n, so the stores
 * do not depend on each other. The byte count is chosen with a small
 * comparison tree (at most three comparisons).
 *
 * Groups of seven bits are written least-significant first; every byte but
 * the last carries the 0x80 continuation flag. The final byte needs no mask
 * because the range check guarantees the remaining bits fit in seven bits
 * (four bits for the 5-byte case, assuming a 32-bit unsigned int).
 *
 * Returns the number of bytes written (1..5).
 */
size_t
compress_unsigned_int(unsigned int n, char *data)
{
    if (n < (1U << 14)) {
        if (n < (1U << 7)) {
            data[0] = n;
            return 1;
        } else {
            data[0] = (n & 0x7f) | 0x80;
            data[1] = n >> 7;
            return 2;
        }
    } else if (n < (1U << 28)) {
        if (n < (1U << 21)) {
            data[0] = (n & 0x7f) | 0x80;
            data[1] = ((n >> 7) & 0x7f) | 0x80;
            data[2] = n >> 14;
            return 3;
        } else {
            data[0] = (n & 0x7f) | 0x80;
            data[1] = ((n >> 7) & 0x7f) | 0x80;
            data[2] = ((n >> 14) & 0x7f) | 0x80;
            data[3] = n >> 21;
            return 4;
        }
    } else {
        data[0] = (n & 0x7f) | 0x80;
        data[1] = ((n >> 7) & 0x7f) | 0x80;
        data[2] = ((n >> 14) & 0x7f) | 0x80;
        data[3] = ((n >> 21) & 0x7f) | 0x80;
        data[4] = n >> 28;
        return 5;
    }
}
我通过在0..UINT_MAX的紧密循环中执行它来测试性能。在我的系统上,执行时间是:
(Lower is better)
Original: 100%
caf's unrolled version: 79%
My version: 57%
一些细微的调整可能会产生更好的结果,但我怀疑除非改用汇编语言,否则很难再获得更大的改进。如果您的整数倾向于落在特定范围内,那么可以使用性能分析(profiling)让编译器为每个分支加入正确的分支预测。这可能会让您再获得几个百分点的速度提升。(编辑:我通过重新排列分支获得了8%的提升,但这是一种不合理的优化,因为它依赖于0...UINT_MAX中每个数字以相同频率出现这一事实。我不建议这样做。)
SSE帮不上忙。SSE被设计为同时对多个宽度相同的数据进行操作,而用SIMD来加速可变长度编码是出了名的困难。(这不一定不可能,但很可能做不到,而且您必须非常聪明才能想出办法。)
答案 1 :(得分:3)
如果您的unsigned int值限制在特定范围(例如32位),则可以展开循环:
/*
 * Unrolled ULEB128 encoder for values of up to 32 bits.
 *
 * The encoded length is determined first from the magnitude of n; the
 * switch then enters a fall-through chain at the matching depth so that
 * exactly (length - 1) continuation bytes are emitted before the final
 * byte. Returns the number of bytes written (1..5).
 */
size_t
compress_unsigned_int(unsigned int n, char* data)
{
    size_t length;

    if (n < 0x00000080U) {
        length = 1;
    } else if (n < 0x00004000U) {
        length = 2;
    } else if (n < 0x00200000U) {
        length = 3;
    } else if (n < 0x10000000U) {
        length = 4;
    } else {
        length = 5;
    }

    switch (length) {
    case 5:
        *data++ = (n & 0x7f) | 0x80;
        n >>= 7;
        /* fallthrough */
    case 4:
        *data++ = (n & 0x7f) | 0x80;
        n >>= 7;
        /* fallthrough */
    case 3:
        *data++ = (n & 0x7f) | 0x80;
        n >>= 7;
        /* fallthrough */
    case 2:
        *data++ = (n & 0x7f) | 0x80;
        n >>= 7;
        /* fallthrough */
    default:
        break;
    }

    /* Whatever is left fits in seven bits and terminates the sequence. */
    *data = n;
    return length;
}
答案 2 :(得分:3)
您可能会在Google协议缓冲区中找到快速实施:
http://code.google.com/p/protobuf/
查看CodedOutputStream :: WriteVarintXXX方法。
第一种方法可能会重写为:
// Fragment: these statements are meant to form the body of the encoder
// function, where n is the value to encode and data is the output pointer.
// The byte count is derived from how far data advanced instead of being
// tracked in a separate counter.
char *start = data;
// While n does not fit in seven bits, store its low bits with bit 7 set
// (continuation flag) and move on to the next group of seven bits.
while (n>=0x80)
{
*data++=(n|0x80);
n>>=7;
}
// Final byte: n is now below 0x80, so bit 7 is clear and ends the sequence.
*data++=n;
// Number of bytes written.
return data-start;
根据我的测试,Google Protocol Buffers的实现是最快的,其次才是其他实现。然而,我的测试相当人为,最好在您的应用程序中测试每种方法并选择最佳方法。这里提出的各种优化只在特定的数值范围上更有效。
这是我的测试应用程序的代码。 (请注意,我已从compress_unsigned_int_google_buf中删除了代码。您可以在以下文件中找到google buffer protocol中的实现:coded_stream.cc方法CodedOutputStream :: WriteVarint32FallbackToArrayInline)
/*
 * Baseline ULEB128 encoder used by the benchmark.
 *
 * Writes n to data seven bits at a time, low group first, setting the high
 * bit on every byte except the last. Returns the number of bytes written.
 */
size_t compress_unsigned_int(unsigned int n, char* data)
{
    size_t written = 1;   /* the terminating byte is always emitted */

    while (n >= 128)
    {
        *data++ = (n & 127) | 128;
        n >>= 7;
        written++;
    }

    *data = n;
    return written;
}
/*
 * Unrolled ULEB128 encoder (benchmark variant "Improved").
 *
 * Picks the encoded length from the magnitude of n, then falls through a
 * switch so that (length - 1) continuation bytes are written before the
 * terminating byte. Returns the number of bytes written (1..5).
 */
size_t compress_unsigned_int_improved(unsigned int n, char* data)
{
    const size_t length = (n < 0x00000080U) ? 1
                        : (n < 0x00004000U) ? 2
                        : (n < 0x00200000U) ? 3
                        : (n < 0x10000000U) ? 4
                        : 5;

    switch (length) {
    case 5:
        *data++ = (n & 0x7f) | 0x80;
        n >>= 7;
        /* fallthrough */
    case 4:
        *data++ = (n & 0x7f) | 0x80;
        n >>= 7;
        /* fallthrough */
    case 3:
        *data++ = (n & 0x7f) | 0x80;
        n >>= 7;
        /* fallthrough */
    case 2:
        *data++ = (n & 0x7f) | 0x80;
        n >>= 7;
        /* fallthrough */
    default:
        break;
    }

    /* Remaining bits fit in one byte without the continuation flag. */
    *data = n;
    return length;
}
/*
 * Branch-tree ULEB128 encoder (benchmark variant "More Improved").
 *
 * n is never modified: every output byte is computed directly from it, so
 * the stores are independent of each other. The length is selected with a
 * small comparison tree.
 *
 * Groups of seven bits are stored least-significant first, with the 0x80
 * continuation flag on every byte except the last, so the output matches
 * the other encoders in this file. The final byte needs no mask because the
 * range check already bounds it (assuming a 32-bit unsigned int for the
 * 5-byte case).
 *
 * Returns the number of bytes written (1..5).
 */
size_t compress_unsigned_int_more_improved(unsigned int n, char *data)
{
    if (n < (1U << 14)) {
        if (n < (1U << 7)) {
            data[0] = n;
            return 1;
        } else {
            data[0] = (n & 0x7f) | 0x80;
            data[1] = n >> 7;
            return 2;
        }
    } else if (n < (1U << 28)) {
        if (n < (1U << 21)) {
            data[0] = (n & 0x7f) | 0x80;
            data[1] = ((n >> 7) & 0x7f) | 0x80;
            data[2] = n >> 14;
            return 3;
        } else {
            data[0] = (n & 0x7f) | 0x80;
            data[1] = ((n >> 7) & 0x7f) | 0x80;
            data[2] = ((n >> 14) & 0x7f) | 0x80;
            data[3] = n >> 21;
            return 4;
        }
    } else {
        data[0] = (n & 0x7f) | 0x80;
        data[1] = ((n >> 7) & 0x7f) | 0x80;
        data[2] = ((n >> 14) & 0x7f) | 0x80;
        data[3] = ((n >> 21) & 0x7f) | 0x80;
        data[4] = n >> 28;
        return 5;
    }
}
/*
 * Compact ULEB128 encoder (benchmark variant "Simple").
 *
 * Stores n | 0x80 for every group that is followed by another one; the
 * assignment into the char buffer keeps only the low byte. The last byte
 * is n itself once it has dropped below 0x80. Returns the byte count.
 */
size_t compress_unsigned_int_simple(unsigned int n, char *data)
{
    size_t used = 0;

    for (; n >= 0x80; n >>= 7)
    {
        data[used++] = (n | 0x80);
    }

    data[used++] = n;
    return used;
}
/*
 * Varint (ULEB128) encoder with the signature used by the benchmark's
 * "Google Buffers" case.
 *
 * The Protocol Buffers implementation the benchmark refers to is not
 * reproduced here; this is a plain stand-in that produces the same wire
 * format so the function returns a defined value instead of falling off
 * the end of a non-void function (undefined behaviour when the result is
 * used).
 *
 * Declared static inline so the definition is self-contained in this
 * translation unit whether the file is built as C or as C++.
 *
 * value  - number to encode
 * target - output buffer; at most 5 bytes are written for a 32-bit value
 * Returns the number of bytes written.
 */
static inline size_t compress_unsigned_int_google_buf(unsigned int value, unsigned char* target) {
  size_t count = 0;

  /* Low seven bits plus continuation flag while more bits remain. */
  while (value >= 0x80) {
    target[count++] = (unsigned char)((value & 0x7f) | 0x80);
    value >>= 7;
  }

  /* Final byte: high bit clear. */
  target[count++] = (unsigned char)value;
  return count;
}
#include <iostream>
#include <Windows.h>
using namespace std;
int _tmain(int argc, _TCHAR* argv[])
{
char data[20];
unsigned char udata[20];
size_t size = 0;
__int64 timer;
cout << "Plain copy: ";
timer = GetTickCount64();
size = 0;
for (int i=0; i<536870900; i++)
{
memcpy(data,&i,sizeof(i));
size += sizeof(i);
}
cout << GetTickCount64() - timer << " Size: " << size << endl;
cout << "Original: ";
timer = GetTickCount64();
size = 0;
for (int i=0; i<536870900; i++)
{
size += compress_unsigned_int(i,data);
}
cout << GetTickCount64() - timer << " Size: " << size << endl;
cout << "Improved: ";
timer = GetTickCount64();
size = 0;
for (int i=0; i<536870900; i++)
{
size += compress_unsigned_int_improved(i,data);
}
cout << GetTickCount64() - timer << " Size: " << size << endl;
cout << "More Improved: ";
timer = GetTickCount64();
size = 0;
for (int i=0; i<536870900; i++)
{
size += compress_unsigned_int_more_improved(i,data);
}
cout << GetTickCount64() - timer << " Size: " << size << endl;
cout << "Simple: ";
timer = GetTickCount64();
size = 0;
for (int i=0; i<536870900; i++)
{
size += compress_unsigned_int_simple(i,data);
}
cout << GetTickCount64() - timer << " Size: " << size << endl;
cout << "Google Buffers: ";
timer = GetTickCount64();
size = 0;
for (int i=0; i<536870900; i++)
{
size += compress_unsigned_int_google_buf(i,udata);
}
cout << GetTickCount64() - timer << " Size: " << size << endl;
return 0;
}
在使用Visual C ++编译器的机器上,我得到了以下结果:
Plain copy: 358 ms
Original: 2497 ms
Improved: 2215 ms
More Improved: 2231 ms
Simple: 2059 ms
Google Buffers: 968 ms
答案 3 :(得分:2)
经过更多浏览后,我在Sqlite3中找到了另一个常用的实现(代码版本3070900):
/*
** Write the 64-bit value v to p as a variable-length integer and return
** the number of bytes written.
**
** The most significant group is stored first. When any of the top eight
** bits of v are set, nine bytes are produced: the last byte holds the low
** eight bits of v verbatim and each of the eight bytes before it holds
** seven bits with the 0x80 flag set. Otherwise the value is split into
** seven-bit groups; all bytes except the last carry the 0x80 flag.
*/
inline int sqlite3PutVarint(unsigned char *p, unsigned __int64 v){
  unsigned char scratch[10];
  int count = 0;
  int dst;

  if( (v >> 56)!=0 ){
    /* Nine-byte form. */
    int idx = 8;
    p[idx] = (unsigned char)v;
    v >>= 8;
    while( idx-- > 0 ){
      p[idx] = (unsigned char)((v & 0x7f) | 0x80);
      v >>= 7;
    }
    return 9;
  }

  /* Collect the groups least-significant first, all flagged... */
  do{
    scratch[count] = (unsigned char)((v & 0x7f) | 0x80);
    count++;
    v >>= 7;
  }while( v!=0 );

  /* ...then clear the flag on the group that will end up last. */
  scratch[0] &= 0x7f;

  /* Emit in reverse so the most significant group comes first. */
  for(dst=0; dst<count; dst++){
    p[dst] = scratch[count-1-dst];
  }
  return count;
}
还有针对32位int的稍微优化的版本:
/*
** Write the 32-bit value v to p as a variable-length integer and return
** the number of bytes written.
**
** Values that fit in one or two seven-bit groups are handled inline; the
** two-byte form stores the upper group first with the 0x80 flag set.
** Anything larger is passed on to sqlite3PutVarint().
*/
int sqlite3PutVarint32(unsigned char *p, unsigned int v){
  if( v<=0x7f ){
    /* Single byte, no flag. */
    p[0] = v;
    return 1;
  }else if( v<=0x3fff ){
    p[0] = (unsigned char)(0x80 | (v>>7));
    p[1] = (unsigned char)(v & 0x7f);
    return 2;
  }
  return sqlite3PutVarint(p, v);
}
令人失望的是,Sqlite的实现在我的测试中表现最差。因此,如果您打算使用Sqlite,请考虑用优化过的实现替换其默认实现。
与此同时,我正在考虑进一步可能的优化。
答案 4 :(得分:0)
您可以把
size_t size=0;...++size;...;return ++size;
替换为
char* base=data;...;return data-base;
从而节省一些操作。
答案 5 :(得分:0)
这是我用x86汇编语言(32位)做的优化。您可以使用NASM汇编后再进行链接。我不知道它是更快还是更慢,我只是写得很开心 :)
global compress_unsigned_int
; compress_unsigned_int -- ULEB128-encode a 32-bit value (x86, 32-bit code, NASM syntax).
;
; Inputs (read from the stack): [esp+4] = n, the value to encode
;                               [esp+8] = data, pointer to the output buffer
; Output: eax = number of bytes written (1..5).
; Registers written: eax, ecx, edx.
; NOTE(review): the stack layout matches a cdecl call of the C function from the
; question, size_t compress_unsigned_int(unsigned int n, char *data) -- confirm
; against the calling convention actually used by the caller.
;
; Technique: each "shld edx, eax, k" shifts edx left by k and fills the vacated
; low bits with the top k bits of eax (eax itself is unchanged); the following
; "shl eax" then discards bits from the top of eax. The shl counts are one less
; than the bits just consumed, so every later 8-bit shld drops a 7-bit group into
; the low 7 bits of a byte plus one extra bit on top. That extra bit is either
; forced to 1 by the OR mask (continuation flag) or is known to be 0 from the
; range check. At least 32 bits are shifted into edx on every path that uses it,
; so its previous contents never reach the output.
;
; bit fields:
; 31 0
; eeeedddddddcccccccbbbbbbbaaaaaaa
compress_unsigned_int:
mov eax, [esp+4] ; eax = n
mov ecx, [esp+8] ; ecx = data
; n <= 0x0FFFFFFF (at most 28 bits): four bytes or fewer are enough.
cmp eax, 00001111111111111111111111111111b
jbe out4b
; --- 5-byte encoding: e is non-zero ---
shld edx, eax, 11 ; low 11 bits of edx = e (4 bits) + d (7 bits)
shl eax, 10 ; top byte of eax = lowest d bit + c
shld edx, eax, 8 ; append that byte
shl eax, 7 ; top byte of eax = lowest c bit + b
shld edx, eax, 8
shl eax, 7 ; top byte of eax = lowest b bit + a
shld edx, eax, 8 ; edx bytes, high to low: d, c, b, a (each with one stray top bit)
or edx, 10000000100000001000000010000000b ; continuation flag on all four bytes
mov [ecx], edx ; little-endian store: a, b, c, d at data[0..3]
mov eax, [esp+4] ; reload n, eax was consumed by the shifts
shr eax, 28 ; al = e
mov [ecx+4], al ; final byte, flag clear
mov eax, 5
jmp exit
out4b:
; n <= 0x001FFFFF (at most 21 bits): three bytes or fewer are enough.
cmp eax, 00000000000111111111111111111111b
jbe out3b
; --- 4-byte encoding: same shuffle as above ---
shld edx, eax, 11
shl eax, 10
shld edx, eax, 8
shl eax, 7
shld edx, eax, 8
shl eax, 7
shld edx, eax, 8
; Flag only the three low bytes; the top bit of the d byte is bit 28 of n,
; which the range check above guarantees to be zero.
or edx, 00000000100000001000000010000000b
mov [ecx], edx ; a, b, c, d at data[0..3]
mov eax, 4
jmp exit
out3b:
; n <= 0x00003FFF (at most 14 bits): two bytes or fewer are enough.
cmp eax, 00000000000000000011111111111111b
jbe out2b
; --- 3-byte encoding ---
shld edx, eax, 25 ; low 25 bits of edx = bits 31..7 of n
shl eax, 24 ; top byte of eax = bits 7..0 of n
shld edx, eax, 8 ; edx byte 0 = bits 7..0 of n, byte 1 = bits 14..7 of n
mov eax, edx ; keep an unflagged copy for extracting c
or edx, 00000000000000001000000010000000b ; continuation flag on bytes 0 and 1
mov [ecx], dx ; a and b at data[0..1]
shr eax, 15 ; al = bits 21..14 of n = c (bit 21 is zero by the range check)
mov [ecx+2], al ; final byte
mov eax, 3
jmp exit
out2b:
; n <= 0x7F: a single byte is enough.
cmp eax, 00000000000000000000000001111111b
jbe out1b
; --- 2-byte encoding ---
shld edx, eax, 25
shl eax, 24
shld edx, eax, 8 ; dl = bits 7..0 of n, dh = bits 14..7 of n (bit 14 is zero here)
or edx, 00000000000000000000000010000000b ; continuation flag on the first byte only
mov [ecx], dx ; a and b at data[0..1]
mov eax, 2
jmp exit
out1b:
; --- 1-byte encoding: n itself, flag clear ---
mov [ecx], al
mov eax, 1
exit:
ret