如果您可以重新排列数据,以便同时处理两对输入向量,则可以使用以下代码(仅限 SSE2):
// @brief Computes two squared Euclidean distances between two pairs of 3D vectors using SSE2.
// @param a
// Pointer to the first pair of 3D vectors, read as 6 contiguous doubles:
// (a0.x, a0.y, a0.z, a1.x, a1.y, a1.z).
// The two vectors must be stored with a stride of 24 bytes, i.e. (a + 3) should point to the first component of the second vector in the pair.
// Must be aligned to 16 bytes (2 doubles), because aligned loads are used.
// @param b
// Pointer to the second pair of 3D vectors, read as 6 contiguous doubles:
// (b0.x, b0.y, b0.z, b1.x, b1.y, b1.z).
// The two vectors must be stored with a stride of 24 bytes, i.e. (b + 3) should point to the first component of the second vector in the pair.
// Must be aligned to 16 bytes (2 doubles), because aligned loads are used.
// @param c
// Pointer to the output 2-element array.
// Must be aligned to 16 bytes (2 doubles), because an aligned store is used.
// The squared distance |a0 - b0|^2 is written to c[0] and |a1 - b1|^2 to c[1].
//
// The pointers are restrict-qualified: the three buffers must not overlap.
// In the lane comments below, registers are written as ( high lane, low lane ),
// where the low lane holds the element at the lower memory address.
void squared_distances_x2(const double * __restrict a, const double * __restrict b, double * __restrict c) {
    // diff0 = ( a0.y - b0.y, a0.x - b0.x ) = ( d0.y, d0.x )
    __m128d diff0 = _mm_sub_pd(_mm_load_pd(a), _mm_load_pd(b));
    // diff1 = ( a1.x - b1.x, a0.z - b0.z ) = ( d1.x, d0.z )
    __m128d diff1 = _mm_sub_pd(_mm_load_pd(a + 2), _mm_load_pd(b + 2));
    // diff2 = ( a1.z - b1.z, a1.y - b1.y ) = ( d1.z, d1.y )
    __m128d diff2 = _mm_sub_pd(_mm_load_pd(a + 4), _mm_load_pd(b + 4));

    // prod0 = ( d0.y * d0.y, d0.x * d0.x )
    __m128d prod0 = _mm_mul_pd(diff0, diff0);
    // prod1 = ( d1.x * d1.x, d0.z * d0.z )
    __m128d prod1 = _mm_mul_pd(diff1, diff1);
    // prod2 = ( d1.z * d1.z, d1.y * d1.y )
    __m128d prod2 = _mm_mul_pd(diff2, diff2);

    // prod1 already has one term of each vector in the right lane; the unpacks
    // route the remaining two terms of each vector into the matching lanes.
    // _mm_unpacklo_pd(prod0, prod2) = ( d1.y * d1.y, d0.x * d0.x )
    // psum = ( d1.x * d1.x + d1.y * d1.y, d0.x * d0.x + d0.z * d0.z )
    __m128d psum = _mm_add_pd(_mm_unpacklo_pd(prod0, prod2), prod1);
    // _mm_unpackhi_pd(prod0, prod2) = ( d1.z * d1.z, d0.y * d0.y )
    // dotprod = ( d1.x * d1.x + d1.y * d1.y + d1.z * d1.z, d0.x * d0.x + d0.y * d0.y + d0.z * d0.z )
    __m128d dotprod = _mm_add_pd(_mm_unpackhi_pd(prod0, prod2), psum);

    // Low lane -> c[0] (first pair), high lane -> c[1] (second pair).
    _mm_store_pd(c, dotprod);
}