除非您的CPU支持XOP,否则没有高效的方法可以直接比较无符号64位整数。
下面的代码摘自Agner Fog的向量类库(Vector Class Library),展示了如何比较无符号64位整数。
// Element-wise unsigned 64-bit comparison: each 64-bit lane of the result is
// all-ones where a > b and all-zeros otherwise.
static inline Vec2qb operator > (Vec2uq const & a, Vec2uq const & b) {
#ifdef __XOP__
    // AMD XOP provides a native unsigned 64-bit "greater than" compare.
    // Wrapped the same way as the fallback path so both branches return Vec2qb.
    return Vec2qb(Vec2q(_mm_comgt_epu64(a, b)));
#else
    // SSE2 fallback: there is no 64-bit compare, so build it from 32-bit compares.
    // Flipping the sign bit of every dword turns the signed 32-bit compare
    // into an unsigned one.
    __m128i sign32  = _mm_set1_epi32(0x80000000);
    __m128i aflip   = _mm_xor_si128(a, sign32);
    __m128i bflip   = _mm_xor_si128(b, sign32);
    __m128i equal   = _mm_cmpeq_epi32(a, b);           // a == b, per dword
    __m128i bigger  = _mm_cmpgt_epi32(aflip, bflip);   // a > b (unsigned), per dword
    // 0xA0 selects dwords [0,0,2,2]: copy each low-dword result into the
    // position of its high dword.
    __m128i biggerl = _mm_shuffle_epi32(bigger, 0xA0);
    // High dwords equal AND low dword bigger.
    __m128i eqbig   = _mm_and_si128(equal, biggerl);
    // High dword bigger OR (high dwords equal AND low dword bigger).
    __m128i hibig   = _mm_or_si128(bigger, eqbig);
    // 0xF5 selects dwords [1,1,3,3]: broadcast the high-dword verdict over
    // the whole 64-bit lane.
    __m128i big     = _mm_shuffle_epi32(hibig, 0xF5);
    return Vec2qb(Vec2q(big));
#endif
}
如果你的CPU支持XOP,那么你应该尝试使用 `-mxop` 编译,并查看循环是否被向量化。
编辑:如果GCC不能按照你的要求进行向量化,而你的CPU支持XOP,则可以执行以下操作:
// Vectorized part (requires XOP): processes two 64-bit counters per iteration
// and decrements every counter that is non-zero.
// The bound is written as WorkerID + 1 < WorkersON so that it does not wrap
// around when WorkersON is an unsigned 0.
for (WorkerID = 0; WorkerID + 1 < WorkersON; WorkerID += 2) {
    __m128i v = _mm_loadu_si128((__m128i*)&WorkerDataTime[WorkerID]);
    // cmp is all-ones (i.e. -1) in each 64-bit lane where v > 0 (unsigned), else 0.
    __m128i cmp = _mm_comgt_epu64(v, _mm_setzero_si128());
    // Adding -1 decrements exactly the non-zero lanes.
    v = _mm_add_epi64(v, cmp);
    _mm_storeu_si128((__m128i*)&WorkerDataTime[WorkerID], v);
}
// Scalar tail: handles the remaining element when WorkersON is odd.
for (; WorkerID < WorkersON; ++WorkerID) {
    if (WorkerDataTime[WorkerID] > 0) WorkerDataTime[WorkerID] -= 1;
}
使用 `-mxop` 编译,并在源文件中加入 `#include <x86intrin.h>`。
编辑:如Nils Pipenbrinck所指出,如果您没有XOP,可以多用一条指令 `_mm_xor_si128` 来完成此操作(其中的 `_mm_cmpeq_epi64` 需要SSE4.1):
// Vectorized part (requires SSE4.1 for _mm_cmpeq_epi64): processes two 64-bit
// counters per iteration and decrements every counter that is non-zero.
// The bound is written as WorkerID + 1 < WorkersON so that it does not wrap
// around when WorkersON is an unsigned 0.
for (WorkerID = 0; WorkerID + 1 < WorkersON; WorkerID += 2) {
    __m128i v = _mm_loadu_si128((__m128i*)&WorkerDataTime[WorkerID]);
    // mask is all-ones in each 64-bit lane where v == 0 ...
    __m128i mask = _mm_cmpeq_epi64(v, _mm_setzero_si128());
    // ... so inverting it gives all-ones (i.e. -1) where v != 0.
    mask = _mm_xor_si128(mask, _mm_set1_epi32(~0));
    // Adding -1 decrements exactly the non-zero lanes.
    v = _mm_add_epi64(v, mask);
    _mm_storeu_si128((__m128i*)&WorkerDataTime[WorkerID], v);
}
// Scalar tail: handles the remaining element when WorkersON is odd.
for (; WorkerID < WorkersON; ++WorkerID) {
    if (WorkerDataTime[WorkerID] > 0) WorkerDataTime[WorkerID] -= 1;
}
编辑:
根据Stephen Canon的评论,我了解到使用SSE4.2中的 `pcmpgtq` 指令可以更高效地比较一般的64位无符号整数:
__m128i a, b;  // the two vectors of unsigned 64-bit values to compare (fill in before use)
// Flipping the top bit of each 64-bit lane maps unsigned ordering onto signed
// ordering, so the signed 64-bit compare below yields the unsigned result.
__m128i sign64 = _mm_set1_epi64x(0x8000000000000000L);
__m128i aflip = _mm_xor_si128(a, sign64);
__m128i bflip = _mm_xor_si128(b, sign64);
// Signed 64-bit compare (pcmpgtq, SSE4.2): all-ones in lanes where a > b (unsigned).
__m128i cmp = _mm_cmpgt_epi64(aflip, bflip);