一般的优化是改善缓存局部性。将结果与参数放在一起。使用 powf
而不是 pow
可以防止浮点数和双精度数之间的往返浪费时间。任何合理的编译器都会将 1/a
提出循环,所以这不是问题。
您应该检查自动向量化是否可能被编译器欺骗,因为没有在指向数组的指针上使用显式索引。
typedef struct {
float x,y,z, re;
} element_t;
void heavy_load(element_t * const elements, const int num_elts, const float a) {
element_t * elts = elements;
for (i = 0; i < num_elts; ++ i) {
elts->re =
powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
elts ++;
}
}
使用SSE2 SIMD,代码如下所示。我使用并包含了Julien Pommier的中的一部分指数计算代码; 许可证附在下面。
```
使用 SSE2 SIMD,代码如下。我使用并导入 Julien Pommier 的 sse_mathfun.h
中的指数部分;许可证条款在下方附上。
```
#include <emmintrin.h>
#include <math.h>
#include <assert.h>
#ifdef _MSC_VER
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif
typedef __m128 v4sf;
typedef __m128i v4si;
#define _PS_CONST(Name, Val) \
static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
v4sf exp_ps(v4sf x) {
v4sf tmp = _mm_setzero_ps(), fx;
v4si emm0;
v4sf one = *(v4sf*)_ps_1;
x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
emm0 = _mm_cvttps_epi32(fx);
tmp = _mm_cvtepi32_ps(emm0);
v4sf mask = _mm_cmpgt_ps(tmp, fx);
mask = _mm_and_ps(mask, one);
fx = _mm_sub_ps(tmp, mask);
tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
x = _mm_sub_ps(x, tmp);
x = _mm_sub_ps(x, z);
z = _mm_mul_ps(x,x);
v4sf y = *(v4sf*)_ps_cephes_exp_p0;
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, x);
y = _mm_add_ps(y, one);
emm0 = _mm_cvttps_epi32(fx);
emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
emm0 = _mm_slli_epi32(emm0, 23);
v4sf pow2n = _mm_castsi128_ps(emm0);
y = _mm_mul_ps(y, pow2n);
return y;
}
typedef struct {
float x,y,z, re;
} element_t;
void fast(element_t * const elements, const int num_elts, const float a) {
element_t * elts = elements;
float logf_a = logf(a);
float logf_1_a = logf(1.0/a);
v4sf log_a = _mm_load1_ps(&logf_a);
v4sf log_1_a = _mm_load1_ps(&logf_1_a);
assert(num_elts % 3 == 0);
for (int i = 0; i < num_elts; i += 3) {
v4sf r0 = _mm_load_ps(&elts[0].x);
v4sf r1 = _mm_load_ps(&elts[1].x);
v4sf r2 = _mm_load_ps(&elts[2].x);
v4sf r3 = _mm_setzero_ps();
v4sf t0 = _mm_unpacklo_ps(r0, r1);
v4sf t1 = _mm_unpacklo_ps(r2, r3);
v4sf t2 = _mm_unpackhi_ps(r0, r1);
v4sf t3 = _mm_unpackhi_ps(r2, r3);
r0 = _mm_movelh_ps(t0, t1);
r1 = _mm_movehl_ps(t1, t0);
r2 = _mm_movelh_ps(t2, t3);
v4sf r0a = _mm_mul_ps(r0, log_a);
v4sf r1a = _mm_mul_ps(r1, log_a);
v4sf r2a = _mm_mul_ps(r2, log_a);
v4sf ex0 = exp_ps(r0a);
v4sf ex1 = exp_ps(r1a);
v4sf ex2 = exp_ps(r2a);
v4sf s1 = _mm_add_ps(ex0, ex1);
v4sf s2 = _mm_add_ps(sum1, ex2);
v4sf ps = _mm_mul_ps(s2, log_1_a);
v4sf es = exp_ps(ps);
ALIGN16_BEG float re[4] ALIGN16_END;
_mm_store_ps(re, es);
elts[0].re = re[0];
elts[1].re = re[1];
elts[2].re = re[2];
elts += 3;
}
}
re
模块?通常你可以通过计算re^a
来避免进行指数运算。能否提供更多上下文信息? - templatetypedefa
恰好是整数,那么可能唯一的优化方式就是通过重复因子和在a
== 2 时使用sqrt
来优化内部幂运算。在一般情况下,您将会使用pow
库函数 4 次。 - Genepow()
函数的调用,因为这最后的乘方运算的指数分别为10或5。这样做可以将运行时间缩短约20%。 - cmaster - reinstate monica