我已经对几个 64 位“最高位”的实现进行了基准测试。最不需要分支的代码并不是最快的。
这是我的highest-bit.c
源文件:
int highest_bit_unrolled(unsigned long long n)
{
if (n & 0xFFFFFFFF00000000) {
if (n & 0xFFFF000000000000) {
if (n & 0xFF00000000000000) {
if (n & 0xF000000000000000) {
if (n & 0xC000000000000000)
return (n & 0x8000000000000000) ? 64 : 63;
else
return (n & 0x2000000000000000) ? 62 : 61;
} else {
if (n & 0x0C00000000000000)
return (n & 0x0800000000000000) ? 60 : 59;
else
return (n & 0x0200000000000000) ? 58 : 57;
}
} else {
if (n & 0x00F0000000000000) {
if (n & 0x00C0000000000000)
return (n & 0x0080000000000000) ? 56 : 55;
else
return (n & 0x0020000000000000) ? 54 : 53;
} else {
if (n & 0x000C000000000000)
return (n & 0x0008000000000000) ? 52 : 51;
else
return (n & 0x0002000000000000) ? 50 : 49;
}
}
} else {
if (n & 0x0000FF0000000000) {
if (n & 0x0000F00000000000) {
if (n & 0x0000C00000000000)
return (n & 0x0000800000000000) ? 48 : 47;
else
return (n & 0x0000200000000000) ? 46 : 45;
} else {
if (n & 0x00000C0000000000)
return (n & 0x0000080000000000) ? 44 : 43;
else
return (n & 0x0000020000000000) ? 42 : 41;
}
} else {
if (n & 0x000000F000000000) {
if (n & 0x000000C000000000)
return (n & 0x0000008000000000) ? 40 : 39;
else
return (n & 0x0000002000000000) ? 38 : 37;
} else {
if (n & 0x0000000C00000000)
return (n & 0x0000000800000000) ? 36 : 35;
else
return (n & 0x0000000200000000) ? 34 : 33;
}
}
}
} else {
if (n & 0x00000000FFFF0000) {
if (n & 0x00000000FF000000) {
if (n & 0x00000000F0000000) {
if (n & 0x00000000C0000000)
return (n & 0x0000000080000000) ? 32 : 31;
else
return (n & 0x0000000020000000) ? 30 : 29;
} else {
if (n & 0x000000000C000000)
return (n & 0x0000000008000000) ? 28 : 27;
else
return (n & 0x0000000002000000) ? 26 : 25;
}
} else {
if (n & 0x0000000000F00000) {
if (n & 0x0000000000C00000)
return (n & 0x0000000000800000) ? 24 : 23;
else
return (n & 0x0000000000200000) ? 22 : 21;
} else {
if (n & 0x00000000000C0000)
return (n & 0x0000000000080000) ? 20 : 19;
else
return (n & 0x0000000000020000) ? 18 : 17;
}
}
} else {
if (n & 0x000000000000FF00) {
if (n & 0x000000000000F000) {
if (n & 0x000000000000C000)
return (n & 0x0000000000008000) ? 16 : 15;
else
return (n & 0x0000000000002000) ? 14 : 13;
} else {
if (n & 0x0000000000000C00)
return (n & 0x0000000000000800) ? 12 : 11;
else
return (n & 0x0000000000000200) ? 10 : 9;
}
} else {
if (n & 0x00000000000000F0) {
if (n & 0x00000000000000C0)
return (n & 0x0000000000000080) ? 8 : 7;
else
return (n & 0x0000000000000020) ? 6 : 5;
} else {
if (n & 0x000000000000000C)
return (n & 0x0000000000000008) ? 4 : 3;
else
return (n & 0x0000000000000002) ? 2 : (n ? 1 : 0);
}
}
}
}
}
int highest_bit_bs(unsigned long long n)
{
const unsigned long long mask[] = {
0x000000007FFFFFFF,
0x000000000000FFFF,
0x00000000000000FF,
0x000000000000000F,
0x0000000000000003,
0x0000000000000001
};
int hi = 64;
int lo = 0;
int i = 0;
if (n == 0)
return 0;
for (i = 0; i < sizeof mask / sizeof mask[0]; i++) {
int mi = lo + (hi - lo) / 2;
if ((n >> mi) != 0)
lo = mi;
else if ((n & (mask[i] << lo)) != 0)
hi = mi;
}
return lo + 1;
}
int highest_bit_shift(unsigned long long n)
{
int i = 0;
for (; n; n >>= 1, i++)
;
return i;
}
static int count_ones(unsigned long long d)
{
d = ((d & 0xAAAAAAAAAAAAAAAA) >> 1) + (d & 0x5555555555555555);
d = ((d & 0xCCCCCCCCCCCCCCCC) >> 2) + (d & 0x3333333333333333);
d = ((d & 0xF0F0F0F0F0F0F0F0) >> 4) + (d & 0x0F0F0F0F0F0F0F0F);
d = ((d & 0xFF00FF00FF00FF00) >> 8) + (d & 0x00FF00FF00FF00FF);
d = ((d & 0xFFFF0000FFFF0000) >> 16) + (d & 0x0000FFFF0000FFFF);
d = ((d & 0xFFFFFFFF00000000) >> 32) + (d & 0x00000000FFFFFFFF);
return d;
}
int highest_bit_parallel(unsigned long long n)
{
n |= n >> 1;
n |= n >> 2;
n |= n >> 4;
n |= n >> 8;
n |= n >> 16;
n |= n >> 32;
return count_ones(n);
}
int highest_bit_so(unsigned long long x)
{
static const unsigned long long t[6] = {
0xFFFFFFFF00000000ull,
0x00000000FFFF0000ull,
0x000000000000FF00ull,
0x00000000000000F0ull,
0x000000000000000Cull,
0x0000000000000002ull
};
int y = (((x & (x - 1)) == 0) ? 0 : 1);
int j = 32;
int i;
for (i = 0; i < 6; i++) {
int k = (((x & t[i]) == 0) ? 0 : j);
y += k;
x >>= k;
j >>= 1;
}
return y;
}
int highest_bit_so2(unsigned long long value)
{
int pos = 0;
if (value & (value - 1ULL))
{
pos = 1;
}
if (value & 0xFFFFFFFF00000000ULL)
{
pos += 32;
value = value >> 32;
}
if (value & 0x00000000FFFF0000ULL)
{
pos += 16;
value = value >> 16;
}
if (value & 0x000000000000FF00ULL)
{
pos += 8;
value = value >> 8;
}
if (value & 0x00000000000000F0ULL)
{
pos += 4;
value = value >> 4;
}
if (value & 0x000000000000000CULL)
{
pos += 2;
value = value >> 2;
}
if (value & 0x0000000000000002ULL)
{
pos += 1;
value = value >> 1;
}
return pos;
}
这是
highest-bit.h
文件:
int highest_bit_unrolled(unsigned long long n);
int highest_bit_bs(unsigned long long n);
int highest_bit_shift(unsigned long long n);
int highest_bit_parallel(unsigned long long n);
int highest_bit_so(unsigned long long n);
int highest_bit_so2(unsigned long long n);
以下是主程序(抱歉,需要复制粘贴):
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include "highest-bit.h"
double timedelta(clock_t start, clock_t end)
{
return (end - start)*1.0/CLOCKS_PER_SEC;
}
int main(int argc, char **argv)
{
int i;
volatile unsigned long long v;
clock_t start, end;
start = clock();
for (i = 0; i < 10000000; i++) {
for (v = 0x8000000000000000; v; v >>= 1)
highest_bit_unrolled(v);
}
end = clock();
printf("highest_bit_unrolled = %6.3fs\n", timedelta(start, end));
start = clock();
for (i = 0; i < 10000000; i++) {
for (v = 0x8000000000000000; v; v >>= 1)
highest_bit_parallel(v);
}
end = clock();
printf("highest_bit_parallel = %6.3fs\n", timedelta(start, end));
start = clock();
for (i = 0; i < 10000000; i++) {
for (v = 0x8000000000000000; v; v >>= 1)
highest_bit_bs(v);
}
end = clock();
printf("highest_bit_bs = %6.3fs\n", timedelta(start, end));
start = clock();
for (i = 0; i < 10000000; i++) {
for (v = 0x8000000000000000; v; v >>= 1)
highest_bit_shift(v);
}
end = clock();
printf("highest_bit_shift = %6.3fs\n", timedelta(start, end));
start = clock();
for (i = 0; i < 10000000; i++) {
for (v = 0x8000000000000000; v; v >>= 1)
highest_bit_so(v);
}
end = clock();
printf("highest_bit_so = %6.3fs\n", timedelta(start, end));
start = clock();
for (i = 0; i < 10000000; i++) {
for (v = 0x8000000000000000; v; v >>= 1)
highest_bit_so2(v);
}
end = clock();
printf("highest_bit_so2 = %6.3fs\n", timedelta(start, end));
return 0;
}
我已尝试了各种不同的Intel x86盒子,无论是新的还是旧的。
highest_bit_unrolled
(展开二进制搜索)始终比highest_bit_parallel
(无分支比特运算)快得多。这比highest_bit_bs
(二进制搜索循环)更快,而highest_bit_bs
又比highest_bit_shift
(朴素移位和计数循环)更快。
highest_bit_unrolled
也比被接受的SO答案中的解法highest_bit_so
以及另一个答案中给出的解法highest_bit_so2
都要快。
基准测试循环遍历了覆盖连续位的一位掩码。这样做是为了尝试击败展开二进制搜索中的分支预测,这是现实中的情况:在真实世界的程序中,输入用例不太可能表现出位位置的局部性。
以下是在旧的Intel(R) Core(TM)2 Duo CPU E4500 @ 2.20GHz
上的结果:
$ ./highest-bit
highest_bit_unrolled = 6.090s
highest_bit_parallel = 9.260s
highest_bit_bs = 19.910s
highest_bit_shift = 21.130s
highest_bit_so = 8.230s
highest_bit_so2 = 6.960s
在较新的型号 Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
上:
highest_bit_unrolled = 1.555s
highest_bit_parallel = 3.420s
highest_bit_bs = 6.486s
highest_bit_shift = 9.505s
highest_bit_so = 4.127s
highest_bit_so2 = 1.645s
在更新的硬件上,
highest_bit_so2
更接近于
highest_bit_unrolled
。顺序不完全相同;现在
highest_bit_so
真正落后,比
highest_bit_parallel
慢。
最快的一个,highest_bit_unrolled
包含了最多的代码和最多的分支。每个不同条件下到达的返回值都有自己专用的代码。
“避免所有分支”的直觉(由于担心分支错误预测)并不总是正确的。现代(甚至不那么现代的)处理器中包含了相当多的技巧,以便不受分支的影响。
P.S. highest_bit_unrolled
是在 2011 年 12 月 在 TXR 语言中引入的(存在错误,已经调试)。
最近,我开始想知道是否可以使用一些更简洁的没有分支的代码来提高速度。
结果让我有些惊讶。
可以说,该代码应该真正地使用#ifdef
来适配 GNU C 并使用一些编译器原语,但就可移植性而言,该版本是保留的。
((x << 1) - 1)
的最高有效位。你需要特殊处理x == 0
,并且如果最高位被设置,你会溢出,但这种方法可能比其他一些舍入技术更快。 - tomlogic