我的另一个答案比较长,与这个答案有很大不同,所以我来说点别的。
这个程序并不仅仅过滤掉第一、二个质数的倍数,也不是将所有相关的质数都编码成一个字节。相反,它过滤掉了适合在八位中存储的所有质数的倍数,具体来说就是2到211。因此,它只将大约10%的数字传递给除法运算符,而不是33%。
这些质数被保存在三个SSE寄存器中,它们与运行计数器的模数被保存在另外三个寄存器中。如果任何一个质数与计数器的模数等于零,则计数器不是质数。而且,如果任何模数等于1,那么(counter+2)就不可能是质数,以此类推,直到(counter+30)。偶数被忽略,像+3、+6和+5这样的偏移量也被跳过。向量处理允许同时更新16个字节大小的变量。
经过对其进行了各种微小的优化(但没有比内联指令更具体的平台),我获得了1.78倍的性能提升(在我的笔记本电脑上,24秒与13.4秒相比)。如果使用大数库(即使是非常快的库),优势会更大。请参见下面一个更易读的、进行了预优化的版本。
#include <cmath>
#include <sstream>
#include <iostream>
#include <xmmintrin.h>
using namespace std;
inline void remove_factor( unsigned long &n, unsigned long factor ) __attribute__((always_inline));
void remove_factor( unsigned long &n, unsigned long factor ) {
while ( n % factor == 0 ) {
n /= factor;
cout << factor << " ";
}
}
int main( int argc, char *argv[] ) {
unsigned long n;
if ( argc != 2
|| ( istringstream( argv[1] ) >> n >> ws ).rdstate() != ios::eofbit ) {
cerr << "Usage: " << argv[0] << " <number>\n";
return 1;
}
int primes[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127,
131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211 };
for ( int *p = primes; p < primes + sizeof primes/sizeof *primes; ++ p ) {
remove_factor( n, * p );
}
enum { bias = 15 - 128 };
__m128i const prime1 = _mm_set_epi8( 21, 21, 21, 22, 22, 26, 26, 17, 19, 23, 29, 31, 37, 41, 43, 47 ),
prime2 = _mm_set_epi8( 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127 ),
prime3 = _mm_set_epi8( 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211 ),
vbias = _mm_set1_epi8( bias ),
v3 = _mm_set1_epi8( 3+bias ), v5 = _mm_set1_epi8( 5+bias ), v6 = _mm_set1_epi8( 6+bias ), v8 = _mm_set1_epi8( 8+bias ),
v9 = _mm_set1_epi8( 9+bias ), v11 = _mm_set1_epi8( 11+bias ), v14 = _mm_set1_epi8( 14+bias ), v15 = _mm_set1_epi8( 15+bias );
__m128i mod1 = _mm_add_epi8( _mm_set_epi8( 3, 10, 17, 5, 16, 6, 19, 8, 9, 11, 14, 15, 18, 20, 21, 23 ), vbias ),
mod2 = _mm_add_epi8( _mm_set_epi8( 26, 29, 30, 33, 35, 36, 39, 41, 44, 48, 50, 51, 53, 54, 56, 63 ), vbias ),
mod3 = _mm_add_epi8( _mm_set_epi8( 65, 68, 69, 74, 75, 78, 81, 83, 86, 89, 90, 95, 96, 98, 99, 105 ), vbias );
for ( unsigned long factor = 1, limit = sqrtl( n ); factor <= limit + 30; factor += 30 ) {
if ( n == 1 ) goto done;
unsigned candidates[8], *cand_pen = candidates;
* cand_pen = 6;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v3 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v3 ), _mm_cmpeq_epi8( mod3, v3 ) ) ) );
* cand_pen = 10;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v5 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v5 ), _mm_cmpeq_epi8( mod3, v5 ) ) ) );
* cand_pen = 12;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v6 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v6 ), _mm_cmpeq_epi8( mod3, v6 ) ) ) );
* cand_pen = 16;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v8 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v8 ), _mm_cmpeq_epi8( mod3, v8 ) ) ) );
* cand_pen = 18;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v9 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v9 ), _mm_cmpeq_epi8( mod3, v9 ) ) ) );
* cand_pen = 22;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v11 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v11 ), _mm_cmpeq_epi8( mod3, v11 ) ) ) );
* cand_pen = 28;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v14 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v14 ), _mm_cmpeq_epi8( mod3, v14 ) ) ) );
* cand_pen = 30;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v15 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v15 ), _mm_cmpeq_epi8( mod3, v15 ) ) ) );
mod1 = _mm_sub_epi8( mod1, _mm_set1_epi8( 15 ) );
__m128i mask1 = _mm_cmplt_epi8( mod1, _mm_set1_epi8( 1+bias ) );
mask1 = _mm_and_si128( mask1, prime1 );
mod1 = _mm_add_epi8( mask1, mod1 );
mod2 = _mm_sub_epi8( mod2, _mm_set1_epi8( 15 ) );
__m128i mask2 = _mm_cmplt_epi8( mod2, _mm_set1_epi8( 1+bias ) );
mask2 = _mm_and_si128( mask2, prime2 );
mod2 = _mm_add_epi8( mask2, mod2 );
mod3 = _mm_sub_epi8( mod3, _mm_set1_epi8( 15 ) );
__m128i mask3 = _mm_cmplt_epi8( mod3, _mm_set1_epi8( 1+bias ) );
mask3 = _mm_and_si128( mask3, prime3 );
mod3 = _mm_add_epi8( mask3, mod3 );
if ( cand_pen - candidates == 0 ) continue;
remove_factor( n, factor + candidates[ 0 ] );
if ( cand_pen - candidates == 1 ) continue;
remove_factor( n, factor + candidates[ 1 ] );
if ( cand_pen - candidates == 2 ) continue;
remove_factor( n, factor + candidates[ 2 ] );
if ( cand_pen - candidates == 3 ) continue;
remove_factor( n, factor + candidates[ 3 ] );
if ( cand_pen - candidates == 4 ) continue;
remove_factor( n, factor + candidates[ 4 ] );
if ( cand_pen - candidates == 5 ) continue;
remove_factor( n, factor + candidates[ 5 ] );
if ( cand_pen - candidates == 6 ) continue;
remove_factor( n, factor + candidates[ 6 ] );
}
cout << n;
done:
cout << endl;
}
.
dkrauss$ /usr/local/bin/g++ main.cpp -o factorize_sse -O3 --profile-use
dkrauss$ time ./factorize_sse 18446743721522234449
4294967231 4294967279
real 0m13.437s
user 0m13.393s
sys 0m0.011s
以下是上述内容的第一版草稿。优化包括:
- 使循环计数器合并无条件(避免分支)。
- 通过将循环展开15倍,增加步长到30来获得ILP。
- 受您的优化启发。
- 30似乎是一个甜点,因为它可以免费去除2、3和5的倍数。
- 5到15之间的质数可能在一个步幅内有几个倍数,因此在向量中放置几个不同相位的副本。
- 分解
remove_factor
。
- 将条件不可预测的
remove_factor
调用改为非分支数组写入。
- 完全展开最终循环,并确保函数始终内联。
- 消除最后一个展开迭代,因为候选项中总是有7的倍数。
- 添加另一个包含所有剩余小于某个值的质数的向量。
- 通过向计数器添加偏差并添加另一个向量来增加更多空间。现在只有6个可能被过滤而不需要升级到16位的质数,并且我已经用尽了寄存器:循环需要3个质数向量、3个模数向量、8个常量进行搜索,并且每个常量都需要增加并执行范围检查。这使得总共有16个。
- 在此应用中,收益微小(但为正),但此技术的原始目的是过滤筛子的质数,以用于其他答案。所以请继续关注……
易读版本:
#include <cmath>
#include <sstream>
#include <iostream>
#include <xmmintrin.h>
using namespace std;
int main( int argc, char *argv[] ) {
unsigned long n;
if ( argc != 2
|| ( istringstream( argv[1] ) >> n >> ws ).rdstate() != ios::eofbit ) {
cerr << "Usage: " << argv[0] << " <number>\n";
return 1;
}
int primes[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59 };
for ( int *p = primes; p < primes + sizeof primes/sizeof *primes; ++ p ) {
while ( n % * p == 0 ) {
n /= * p;
cout << * p << " ";
}
}
if ( n != 1 ) {
__m128i mod = _mm_set_epi8( 1, 2, 3, 5, 6, 8, 9, 11, 14, 15, 18, 20, 21, 23, 26, 29 );
__m128i const prime = _mm_set_epi8( 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59 ),
one = _mm_set1_epi8( 1 );
for ( unsigned long factor = 1, limit = sqrtl( n ); factor < limit; ) {
factor += 2;
__m128i mask = _mm_cmpeq_epi8( mod, one );
mod = _mm_sub_epi8( mod, one );
if ( _mm_movemask_epi8( mask ) ) {
mask = _mm_and_si128( mask, prime );
mod = _mm_or_si128( mask, mod );
} else while ( n % factor == 0 ) {
n /= factor;
cout << factor << " ";
if ( n == 1 ) goto done;
}
}
cout << n;
}
done:
cout << endl;
}
vector
的依赖。用户仍然可以使用vector
,例如GetFactors(n, back_inserter(v))
。 - Jon Purdy