如何最快地复制整数的位。
例如,
17
-> 10001
复制后:1100000011
请参阅http://graphics.stanford.edu/~seander/bithacks.html#InterleaveTableObvious了解更多方法。Interleave bits the obvious way
(modified from http://graphics.stanford.edu/~seander/bithacks.html)
unsigned int x = 17; unsigned int z = 0; // z gets the resulting Morton Number. for (int i = 0; i < sizeof(x) * CHAR_BIT; i++) // unroll for more speed... { z |= (x & 1U << i) << i | (x & 1U << i) << (i + 1); }
771
,但实际上返回的是2313
。 - Kuaisy
的初始值应该与 x
相同,而不是 x << 1
。这样做(或者完全摆脱 y
并在使用 y
的地方都使用 x
)就可以了。<< 1
已经在循环体中处理过了。 - ShadowRangerz |= (x & 1U << i) * 3 << i;
,循环测试应该是 i < (sizeof(x) * CHAR_BIT + 1) / 2
。当 i
的值最大时,表达式 (x & 1U << i) << (i + 1)
会产生未定义的行为。 - chqrlieuint16_t dup16[256] = { 0, 3, 12, 15, 48, ... };
uint32_t x = 17
uint32_t z = dup16[x & 255] | ((uint32_t)dup16[(x >> 8) & 255] << 16);
对于至少带有-march=haswell
的GCC:
最初的回答:
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <immintrin.h>
std::uint64_t interleave_pdep(std::uint32_t input) {
return _pdep_u64(input, 0x5555555555555555)
| _pdep_u64(input, 0xaaaaaaaaaaaaaaaa);
}
int main()
{
std::uint32_t i;
std::cin >> std::hex;
std::cout << std::hex;
while (std::cin >> i)
std::cout << interleave_pdep(i) << '\n';
}
__builtin_ia32_pdep_di
,请改用官方提供的 _pdep_u64
。 - Marc Glisse_pdep_u64
在此范围内未声明),并且除了一些LLVM头文件之外,/usr/include
下也没有pdep
。文档也相当缺乏。如果您能解决这个问题,我会非常感激。 - Arne Vogelimmintrin.h
或 x86intrin.h
。文档可以在英特尔网站上找到。 - Marc Glisse我认为最快的方法应该是使用查找表。
你可以在编译时计算它。
#include <array>
#include <cstdint>
#include <iostream>
#include <iomanip>
constexpr std::uint64_t dup_bit(bool b)
{
unsigned result = 0;
if (b) result = 0x3;
return result;
}
constexpr std::uint64_t slow_dup_bits(std::uint64_t input)
{
std::uint64_t result = 0;
for(int i = 16 ; i-- ; )
{
result <<= 2;
result |= dup_bit(input & (1u << i));
}
return result;
}
template<std::uint64_t Start, std::uint64_t...Is>
constexpr auto make_dup_table(std::integer_sequence<std::uint64_t, Is...>)
{
return std::array<std::uint64_t, sizeof...(Is)>
{{
slow_dup_bits(Start + Is)...,
}};
}
std::uint64_t dup_bits(std::uint64_t x)
{
static const auto table = make_dup_table<0>(std::make_integer_sequence<std::uint64_t, 65536>());
auto low32 = table[x & 65535];
auto high32 = table[x >> 16];
return (high32 << 32) + low32;
}
int main()
{
for(std::uint64_t i = 0 ; i < 64 ; ++i)
std::cout << std::setw(8) << std::setfill(' ') << i
<< " -> "
<< std::hex << std::setw(64) << std::setfill('0') << dup_bits(i) << '\n';
}
0 -> 0000000000000000000000000000000000000000000000000000000000000000
1 -> 0000000000000000000000000000000000000000000000000000000000000003
2 -> 000000000000000000000000000000000000000000000000000000000000000c
3 -> 000000000000000000000000000000000000000000000000000000000000000f
4 -> 0000000000000000000000000000000000000000000000000000000000000030
5 -> 0000000000000000000000000000000000000000000000000000000000000033
6 -> 000000000000000000000000000000000000000000000000000000000000003c
7 -> 000000000000000000000000000000000000000000000000000000000000003f
8 -> 00000000000000000000000000000000000000000000000000000000000000c0
9 -> 00000000000000000000000000000000000000000000000000000000000000c3
a -> 00000000000000000000000000000000000000000000000000000000000000cc
b -> 00000000000000000000000000000000000000000000000000000000000000cf
c -> 00000000000000000000000000000000000000000000000000000000000000f0
d -> 00000000000000000000000000000000000000000000000000000000000000f3
e -> 00000000000000000000000000000000000000000000000000000000000000fc
f -> 00000000000000000000000000000000000000000000000000000000000000ff
10 -> 0000000000000000000000000000000000000000000000000000000000000300
11 -> 0000000000000000000000000000000000000000000000000000000000000303
12 -> 000000000000000000000000000000000000000000000000000000000000030c
13 -> 000000000000000000000000000000000000000000000000000000000000030f
14 -> 0000000000000000000000000000000000000000000000000000000000000330
15 -> 0000000000000000000000000000000000000000000000000000000000000333
16 -> 000000000000000000000000000000000000000000000000000000000000033c
17 -> 000000000000000000000000000000000000000000000000000000000000033f
18 -> 00000000000000000000000000000000000000000000000000000000000003c0
19 -> 00000000000000000000000000000000000000000000000000000000000003c3
1a -> 00000000000000000000000000000000000000000000000000000000000003cc
1b -> 00000000000000000000000000000000000000000000000000000000000003cf
1c -> 00000000000000000000000000000000000000000000000000000000000003f0
1d -> 00000000000000000000000000000000000000000000000000000000000003f3
1e -> 00000000000000000000000000000000000000000000000000000000000003fc
1f -> 00000000000000000000000000000000000000000000000000000000000003ff
20 -> 0000000000000000000000000000000000000000000000000000000000000c00
21 -> 0000000000000000000000000000000000000000000000000000000000000c03
22 -> 0000000000000000000000000000000000000000000000000000000000000c0c
23 -> 0000000000000000000000000000000000000000000000000000000000000c0f
24 -> 0000000000000000000000000000000000000000000000000000000000000c30
25 -> 0000000000000000000000000000000000000000000000000000000000000c33
26 -> 0000000000000000000000000000000000000000000000000000000000000c3c
27 -> 0000000000000000000000000000000000000000000000000000000000000c3f
28 -> 0000000000000000000000000000000000000000000000000000000000000cc0
29 -> 0000000000000000000000000000000000000000000000000000000000000cc3
2a -> 0000000000000000000000000000000000000000000000000000000000000ccc
2b -> 0000000000000000000000000000000000000000000000000000000000000ccf
2c -> 0000000000000000000000000000000000000000000000000000000000000cf0
2d -> 0000000000000000000000000000000000000000000000000000000000000cf3
2e -> 0000000000000000000000000000000000000000000000000000000000000cfc
2f -> 0000000000000000000000000000000000000000000000000000000000000cff
30 -> 0000000000000000000000000000000000000000000000000000000000000f00
31 -> 0000000000000000000000000000000000000000000000000000000000000f03
32 -> 0000000000000000000000000000000000000000000000000000000000000f0c
33 -> 0000000000000000000000000000000000000000000000000000000000000f0f
34 -> 0000000000000000000000000000000000000000000000000000000000000f30
35 -> 0000000000000000000000000000000000000000000000000000000000000f33
36 -> 0000000000000000000000000000000000000000000000000000000000000f3c
37 -> 0000000000000000000000000000000000000000000000000000000000000f3f
38 -> 0000000000000000000000000000000000000000000000000000000000000fc0
39 -> 0000000000000000000000000000000000000000000000000000000000000fc3
3a -> 0000000000000000000000000000000000000000000000000000000000000fcc
3b -> 0000000000000000000000000000000000000000000000000000000000000fcf
3c -> 0000000000000000000000000000000000000000000000000000000000000ff0
3d -> 0000000000000000000000000000000000000000000000000000000000000ff3
3e -> 0000000000000000000000000000000000000000000000000000000000000ffc
3f -> 0000000000000000000000000000000000000000000000000000000000000fff
一种无需循环或查找表的解决方案。
它适用于4位nibble。首先,nibble被复制并通过乘法进行移位。然后,有趣的位被掩码和复制。然后通过移位和或运算将所有位聚集在一起。
int duplicate4(unsigned char c){
// c is a 4bits nibble.
// Assume c=2#dcba
unsigned long long spread=(1+(1<<9)+(1<<18)+(1<<27));
// to duplicate the nibble
//=2#00001000.00000100.00000010.00000001
unsigned long long ll=c*spread;
// ll = 0dcba000.00dcba00.000dcba0.0000dcba
unsigned long long mask=(1+(1<<10)+(1<<20)+(1<<30));
// to extract bits
//=2#01000000.00010000.00000100.00000001
ll &= mask;
// ll = 0d000000.000c0000.00000b00.0000000a
ll |= (ll<<1) ;
// ll = dd000000.00cc0000.0000bb00.000000aa
ll |= (ll >> 16) ;
// ll = dd000000.00cc0000.dd00bb00.00cc00aa
ll |= (ll >> 8) ;
// ll = dd000000.00cc0000.dd00bb00.ddccbbaa
ll &= 0xff ;
// ll = 00000000.00000000.00000000.ddccbbaa
return ll;
}
int duplicate8(unsigned char c){
return duplicate4(c&0xf) + 256*duplicate4((c&0xf0)>>4);
}
std::bitset
(https://en.cppreference.com/w/cpp/utility/bitset),而这在C中是不可用的。 - Daniel H