编写了自己的位计数例程之后,我偶然发现了 gcc 的 __builtin_popcount。但是,当我改用 __builtin_popcount 后,我的软件反而变慢了。我使用的是 Ubuntu,CPU 为 2.90 GHz 的 Intel Core i3-4130T。我写了一个性能测试来比较各种实现的表现,代码如下:
#include <iostream>
#include <stdint.h>
#include <sys/time.h>
#include <time.h>
using namespace std;
// Lookup table indexed by a byte value (0..255): bitCount[b] is the number of
// 1 bits in b. Each row below covers 32 consecutive byte values.
const int bitCount[256] = {
0,1,1,2,1,2,2,3, 1,2,2,3,2,3,3,4, 1,2,2,3,2,3,3,4, 2,3,3,4,3,4,4,5,
1,2,2,3,2,3,3,4, 2,3,3,4,3,4,4,5, 2,3,3,4,3,4,4,5, 3,4,4,5,4,5,5,6,
1,2,2,3,2,3,3,4, 2,3,3,4,3,4,4,5, 2,3,3,4,3,4,4,5, 3,4,4,5,4,5,5,6,
2,3,3,4,3,4,4,5, 3,4,4,5,4,5,5,6, 3,4,4,5,4,5,5,6, 4,5,5,6,5,6,6,7,
1,2,2,3,2,3,3,4, 2,3,3,4,3,4,4,5, 2,3,3,4,3,4,4,5, 3,4,4,5,4,5,5,6,
2,3,3,4,3,4,4,5, 3,4,4,5,4,5,5,6, 3,4,4,5,4,5,5,6, 4,5,5,6,5,6,6,7,
2,3,3,4,3,4,4,5, 3,4,4,5,4,5,5,6, 3,4,4,5,4,5,5,6, 4,5,5,6,5,6,6,7,
3,4,4,5,4,5,5,6, 4,5,5,6,5,6,6,7, 4,5,5,6,5,6,6,7, 5,6,6,7,6,7,7,8
};
// Byte-selection masks for a 32-bit word. The four digits in each name stand
// for the four bytes, most significant first; the '1' marks the byte kept.
const uint32_t m32_0001 = 0x000000ffu; // bits 0-7
const uint32_t m32_0010 = 0x0000ff00u; // bits 8-15
const uint32_t m32_0100 = 0x00ff0000u; // bits 16-23
const uint32_t m32_1000 = 0xff000000u; // bits 24-31
// Counts the 1 bits in a 32-bit word by splitting it into its four bytes and
// summing the per-byte counts read from the bitCount lookup table.
//
// bitField: the word whose set bits are counted.
// Returns:  the number of set bits, in the range 0..32.
inline int countBits(uint32_t bitField)
{
    // Isolate each byte with its mask, then shift it down to a 0..255 index.
    const uint32_t lowByte      = bitField & m32_0001;
    const uint32_t midLowByte   = (bitField & m32_0010) >> 8;
    const uint32_t midHighByte  = (bitField & m32_0100) >> 16;
    const uint32_t highByte     = (bitField & m32_1000) >> 24;

    int total = bitCount[lowByte];
    total += bitCount[midLowByte];
    total += bitCount[midHighByte];
    total += bitCount[highByte];
    return total;
}
// Returns a timestamp in microseconds, intended for measuring elapsed time by
// subtracting two readings.
//
// The reading comes from CLOCK_MONOTONIC, which is not affected by wall-clock
// adjustments (NTP steps, manual changes), so intervals cannot come out
// negative or inflated. The epoch is unspecified: only differences between
// two return values are meaningful.
//
// If the monotonic clock cannot be read, falls back to gettimeofday(), i.e.
// microseconds since the Unix epoch.
inline long long currentTime() {
    struct timespec mono;
    if (clock_gettime(CLOCK_MONOTONIC, &mono) == 0) {
        // tv_nsec is in nanoseconds; scale down to microseconds.
        return mono.tv_sec * 1000000LL + mono.tv_nsec / 1000;
    }

    // Fallback: wall-clock time, as a last resort.
    struct timeval wall;
    gettimeofday(&wall, NULL);
    return wall.tv_sec * 1000000LL + wall.tv_usec;
}
// Benchmark driver. Computes the total number of set bits over the integers
// 0 .. 99,999,999 three different ways and, for each, prints the sum together
// with the elapsed time in microseconds.
int main() {
    const unsigned kIterations = 100000000;

    // 1) Table-driven countBits().
    long long began = currentTime();
    long long total = 0;
    for (unsigned n = 0; n < kIterations; ++n) {
        total += countBits(n);
    }
    long long elapsed = currentTime() - began;
    cout << "countBits : sum=" << total << ": time (usec)=" << elapsed << endl;

    // 2) GCC's __builtin_popcount().
    began = currentTime();
    total = 0;
    for (unsigned n = 0; n < kIterations; ++n) {
        total += __builtin_popcount(n);
    }
    elapsed = currentTime() - began;
    cout << "__builtin_popcount: sum=" << total << ": time (usec)=" << elapsed << endl;

    // 3) The popcnt instruction issued directly through inline assembly.
    began = currentTime();
    total = 0;
    for (unsigned n = 0; n < kIterations; ++n) {
        int bits;
        // Output goes to a register; the input may be a register or memory.
        // "cc" tells the compiler the condition flags are modified.
        asm("popcnt %1,%0" : "=r"(bits) : "rm"(n) : "cc");
        total += bits;
    }
    elapsed = currentTime() - began;
    cout << "assembler : sum=" << total << ": time (usec)=" << elapsed << endl;

    return 0;
}
起初,我是使用较旧的编译器运行的:
> g++ --version | head -1
g++ (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4
> cat /proc/cpuinfo | grep 'model name' | head -1
model name : Intel(R) Core(TM) i3-4130T CPU @ 2.90GHz
> g++ -O3 popcountTest.cpp
> ./a.out
countBits : sum=1314447104: time (usec)=148506
__builtin_popcount: sum=1314447104: time (usec)=345122
assembler : sum=1314447104: time (usec)=138036
如您所见,基于查找表的 countBits 几乎与汇编版本一样快,并且比 __builtin_popcount 快得多。然后我在另一台机器上(处理器相同,我认为主板也相同)用较新的编译器又试了一次:
> g++ --version | head -1
g++ (Ubuntu 7.3.0-16ubuntu3) 7.3.0
> cat /proc/cpuinfo | grep 'model name' | head -1
model name : Intel(R) Core(TM) i3-4130T CPU @ 2.90GHz
> g++ -O3 popcountTest.cpp
> ./a.out
countBits : sum=1314447104: time (usec)=164247
__builtin_popcount: sum=1314447104: time (usec)=345167
assembler : sum=1314447104: time (usec)=138028
让我感到奇怪的是,较旧的编译器对我的 countBits 函数的优化反而比较新的编译器更好,不过即便如此,countBits 与汇编版本相比仍然不落下风。显然,既然那行汇编代码能够编译并运行,说明我的处理器支持 popcount 指令;那么为什么 __builtin_popcount 会慢两倍以上?我自己写的例程又怎么可能与硬件实现的 popcount 指令相抗衡?我在查找第一个置位位等其他例程上也有同样的体验:我自己的例程都比 GNU 的“内置(builtin)”等效函数快得多。
(顺便说一句,我不知道如何编写汇编程序。我只是在某些网页上发现了这一行,而且它似乎行之有效。)
答案 0 :(得分:5)
如果没有在命令行上指定适当的 -march 选项,gcc 会生成对 __popcountdi2 函数的调用,而不是 popcnt 指令。参见:https://godbolt.org/z/z1BihM
答案 1 :(得分:1)
按照 Mat 和 Alan Birtles 的建议,我在编译命令中加入了 -march=native,使编译器可以使用 popcount 机器指令;我觉得把新的性能结果分享出来可能会有用。结果因编译器版本而异。这是较旧的编译器:
> g++ --version | head -1
g++ (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4
> cat /proc/cpuinfo | grep 'model name' | head -1
model name : Intel(R) Core(TM) i3-4130T CPU @ 2.90GHz
> g++ -march=native -O3 popcountTest.cpp
> ./a.out
countBits : sum=1314447104: time (usec)=163947
__builtin_popcount: sum=1314447104: time (usec)=138046
assembler : sum=1314447104: time (usec)=138036
这是更新的编译器:
> g++ --version | head -1
g++ (Ubuntu 7.3.0-16ubuntu3) 7.3.0
> cat /proc/cpuinfo | grep 'model name' | head -1
model name : Intel(R) Core(TM) i3-4130T CPU @ 2.90GHz
> g++ -march=native -O3 popcountTest.cpp
> ./a.out
countBits : sum=1314447104: time (usec)=163133
__builtin_popcount: sum=1314447104: time (usec)=73987
assembler : sum=1314447104: time (usec)=138036
观察:
在较旧的 g++ 编译器的命令行中添加 -march=native 后,__builtin_popcount 的性能提高到与汇编版本相当,而我的 countBits 例程则慢了大约 15%。
在较新的 g++ 编译器的命令行中添加 -march=native 后,__builtin_popcount 的性能超过了汇编版本。我猜这与我在汇编版本中使用的栈变量有关,但我并不确定。我的 countBits 的性能没有受到影响(正如我在问题中所说,使用这个较新的编译器时它本来就已经比较慢了)。