在我编写自己的位计数例程之后,我偶然发现了gcc中的__builtin_popcount。但是当我切换到__builtin_popcount后,我的软件实际上运行得更慢了。我使用的是Intel Core i3-4130T CPU @ 2.90GHz上的Unbutu操作系统。我建立了一个性能测试以查看其中原因。测试代码如下:
#include <iostream>
#include <sys/time.h>
#include <stdint.h>
using namespace std;
const int bitCount[256] = {
0,1,1,2,1,2,2,3, 1,2,2,3,2,3,3,4, 1,2,2,3,2,3,3,4, 2,3,3,4,3,4,4,5,
1,2,2,3,2,3,3,4, 2,3,3,4,3,4,4,5, 2,3,3,4,3,4,4,5, 3,4,4,5,4,5,5,6,
1,2,2,3,2,3,3,4, 2,3,3,4,3,4,4,5, 2,3,3,4,3,4,4,5, 3,4,4,5,4,5,5,6,
2,3,3,4,3,4,4,5, 3,4,4,5,4,5,5,6, 3,4,4,5,4,5,5,6, 4,5,5,6,5,6,6,7,
1,2,2,3,2,3,3,4, 2,3,3,4,3,4,4,5, 2,3,3,4,3,4,4,5, 3,4,4,5,4,5,5,6,
2,3,3,4,3,4,4,5, 3,4,4,5,4,5,5,6, 3,4,4,5,4,5,5,6, 4,5,5,6,5,6,6,7,
2,3,3,4,3,4,4,5, 3,4,4,5,4,5,5,6, 3,4,4,5,4,5,5,6, 4,5,5,6,5,6,6,7,
3,4,4,5,4,5,5,6, 4,5,5,6,5,6,6,7, 4,5,5,6,5,6,6,7, 5,6,6,7,6,7,7,8
};
const uint32_t m32_0001 = 0x000000ffu;
const uint32_t m32_0010 = 0x0000ff00u;
const uint32_t m32_0100 = 0x00ff0000u;
const uint32_t m32_1000 = 0xff000000u;
inline int countBits(uint32_t bitField)
{
return
bitCount[(bitField & m32_0001) ] +
bitCount[(bitField & m32_0010) >> 8] +
bitCount[(bitField & m32_0100) >> 16] +
bitCount[(bitField & m32_1000) >> 24];
}
inline long long currentTime() {
struct timeval ct;
gettimeofday(&ct, NULL);
return ct.tv_sec * 1000000LL + ct.tv_usec;
}
int main() {
long long start, delta, sum;
start = currentTime();
sum = 0;
for(unsigned i = 0; i < 100000000; ++i)
sum += countBits(i);
delta = currentTime() - start;
cout << "countBits : sum=" << sum << ": time (usec)=" << delta << endl;
start = currentTime();
sum = 0;
for(unsigned i = 0; i < 100000000; ++i)
sum += __builtin_popcount(i);
delta = currentTime() - start;
cout << "__builtin_popcount: sum=" << sum << ": time (usec)=" << delta << endl;
start = currentTime();
sum = 0;
for(unsigned i = 0; i < 100000000; ++i) {
int count;
asm("popcnt %1,%0" : "=r"(count) : "rm"(i) : "cc");
sum += count;
}
delta = currentTime() - start;
cout << "assembler : sum=" << sum << ": time (usec)=" << delta << endl;
return 0;
}
起初我是用旧版本的编译器运行的:
> g++ --version | head -1
g++ (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4
> cat /proc/cpuinfo | grep 'model name' | head -1
model name : Intel(R) Core(TM) i3-4130T CPU @ 2.90GHz
> g++ -O3 popcountTest.cpp
> ./a.out
countBits : sum=1314447104: time (usec)=148506
__builtin_popcount: sum=1314447104: time (usec)=345122
assembler : sum=1314447104: time (usec)=138036
正如您所看到的,基于表格的countBits函数几乎与汇编语言相同的速度,并且比__builtin_popcount快得多。然后我在不同型号的机器上尝试了一个更新的编译器(相同的处理器 - 我认为主板也是相同的):
As you can see, 基于表格的 countBits 函数几乎与汇编语言相同的速度,并且比 __builtin_popcount 快得多. Then I tried a newer compiler on a different machine type (same processor -- and I think the mother board's the same too):> g++ --version | head -1
g++ (Ubuntu 7.3.0-16ubuntu3) 7.3.0
> cat /proc/cpuinfo | grep 'model name' | head -1
model name : Intel(R) Core(TM) i3-4130T CPU @ 2.90GHz
> g++ -O3 popcountTest.cpp
> ./a.out
countBits : sum=1314447104: time (usec)=164247
__builtin_popcount: sum=1314447104: time (usec)=345167
assembler : sum=1314447104: time (usec)=138028
有趣的是,旧编译器优化了我的countBits函数,比新编译器更好,但与汇编程序相比仍然表现出色。显然,由于汇编行可以编译和运行,我使用的处理器支持popcount,但为什么__builtin_popcount却慢了两倍以上?我的程序如何可能与基于硅的popcount竞争呢?对于查找第一个设置位等其他例程,我也有同样的体验。我的程序都比GNU“内置”等效程序快得多。
(顺便说一句,我不知道如何编写汇编程序。我只是在某个网页上找到了那一行代码,它奇迹般地运行了。)
-march=native
。 - Mat-march=native
做到了!现在,__builtin_popcount 的速度与我的示例中的汇编代码完全相同。我仍然觉得奇怪的是,我的自定义程序只比它慢了20%。popcount 必须消耗很多时钟周期。 - Matthew Busche