如何优化这个计算？(x^a + y^a + z^a)^(1/a)

Question

如何优化这个计算？(x^a + y^a + z^a)^(1/a)

19

正如标题所示，我需要进行很多像这样的计算：

re = (x^a + y^a + z^a)^(1/a).

其中 {x, y, z} >= 0。更具体地说，a 是正浮点常数，x、y、z 是浮点数。符号 ^ 表示乘方运算。

目前，我不想使用 SIMD 加速，但希望使用其他技巧来提高速度。

static void heavy_load(void) {
  static struct xyz_t {
    float x,y,z;
  };
  struct xyz_t xyzs[10000];
  float re[10000] = {.0f};
  const float a = 0.2;

  /* here fill xyzs using some random positive floating point values */

  for (i = 0; i < 10000; ++ i) {
    /* here is what we need to optimize */
    re[i] = pow((pow(xyzs[i].x, a) + pow(xyzs[i].y, a) + pow(xyzs[i].z, a)), 1.0/a);
  }
}

- blackball

7

你是否真的需要了解 re 模块？通常你可以通过计算 re^a 来避免进行指数运算。能否提供更多上下文信息？ - templatetypedef

1

我同意你的问题表述不够清晰。但是，如果 a 恰好是整数，那么可能唯一的优化方式就是通过重复因子和在 a == 2 时使用 sqrt 来优化内部幂运算。在一般情况下，您将会使用 pow 库函数 4 次。 - Gene

3

如果你的a始终是0.1或0.2，你可以通过将最后的乘方运算改为乘法运算来避免一次pow()函数的调用，因为这最后的乘方运算的指数分别为10或5。这样做可以将运行时间缩短约20%。 - cmaster - reinstate monica

1

@KubaOber：如果您启用-ffast-math选项，则gcc将自动为您进行优化。请查看此处的答案（https://dev59.com/Fmw15IYBdhLWcg3wu-M9?rq=1） - phuclv

1

@Kuba，我认为使用SIMD加速这些小代码并不难。这就是为什么我在这个讨论中不喜欢SIMD的原因。 - blackball

显示剩余14条评论

3个回答

4

一般的优化是改善缓存局部性。将结果与参数放在一起。使用 powf 而不是 pow 可以防止浮点数和双精度数之间的往返浪费时间。任何合理的编译器都会将 1/a 提出循环，所以这不是问题。

您应该检查自动向量化是否可能被编译器欺骗，因为没有在指向数组的指针上使用显式索引。

typedef struct {
  float x,y,z, re;
} element_t;

void heavy_load(element_t * const elements, const int num_elts, const float a) {
  element_t * elts = elements;
  for (i = 0; i < num_elts; ++ i) {
    elts->re = 
      powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
    elts ++;
  }
}

使用SSE2 SIMD，代码如下所示。我使用并包含了Julien Pommier的中的一部分指数计算代码; 许可证附在下面。

```

使用 SSE2 SIMD，代码如下。我使用并导入 Julien Pommier 的 sse_mathfun.h 中的指数部分；许可证条款在下方附上。

```

#include <emmintrin.h>
#include <math.h>
#include <assert.h>

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

typedef __m128 v4sf;  // vector of 4 float (sse1)
typedef __m128i v4si; // vector of 4 int (sse2)

#define _PS_CONST(Name, Val)                                            \
    static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val)                                            \
    static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

_PS_CONST(1  , 1.0f);
_PS_CONST(0p5, 0.5f);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(exp_hi,   88.3762626647949f);
_PS_CONST(exp_lo,   -88.3762626647949f);
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

v4sf exp_ps(v4sf x) {
    v4sf tmp = _mm_setzero_ps(), fx;
    v4si emm0;
    v4sf one = *(v4sf*)_ps_1;

    x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
    x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

    fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
    fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

    emm0 = _mm_cvttps_epi32(fx);
    tmp  = _mm_cvtepi32_ps(emm0);

    v4sf mask = _mm_cmpgt_ps(tmp, fx);
    mask = _mm_and_ps(mask, one);
    fx = _mm_sub_ps(tmp, mask);

    tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
    v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
    x = _mm_sub_ps(x, tmp);
    x = _mm_sub_ps(x, z);

    z = _mm_mul_ps(x,x);

    v4sf y = *(v4sf*)_ps_cephes_exp_p0;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, x);
    y = _mm_add_ps(y, one);

    emm0 = _mm_cvttps_epi32(fx);
    emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
    emm0 = _mm_slli_epi32(emm0, 23);
    v4sf pow2n = _mm_castsi128_ps(emm0);
    y = _mm_mul_ps(y, pow2n);
    return y;
}

typedef struct {
    float x,y,z, re;
} element_t;   

void fast(element_t * const elements, const int num_elts, const float a) {
    element_t * elts = elements;
    float logf_a = logf(a);
    float logf_1_a = logf(1.0/a);
    v4sf log_a = _mm_load1_ps(&logf_a);
    v4sf log_1_a = _mm_load1_ps(&logf_1_a);
    assert(num_elts % 3 == 0); // operates on 3 elements at a time

    // elts->re = powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
    for (int i = 0; i < num_elts; i += 3) {
        // transpose
        // we save one operation over _MM_TRANSPOSE4_PS by skipping the last row of output
        v4sf r0 = _mm_load_ps(&elts[0].x); // x1,y1,z1,0
        v4sf r1 = _mm_load_ps(&elts[1].x); // x2,y2,z2,0
        v4sf r2 = _mm_load_ps(&elts[2].x); // x3,y3,z3,0
        v4sf r3 = _mm_setzero_ps();        // 0, 0, 0, 0
        v4sf t0 = _mm_unpacklo_ps(r0, r1); //  x1,x2,y1,y2
        v4sf t1 = _mm_unpacklo_ps(r2, r3); //  x3,0, y3,0
        v4sf t2 = _mm_unpackhi_ps(r0, r1); //  z1,z2,0, 0
        v4sf t3 = _mm_unpackhi_ps(r2, r3); //  z3,0, 0, 0
        r0 = _mm_movelh_ps(t0, t1);        // x1,x2,x3,0
        r1 = _mm_movehl_ps(t1, t0);        // y1,y2,y3,0
        r2 = _mm_movelh_ps(t2, t3);        // z1,z2,z3,0
        // perform pow(x,a),.. using the fact that pow(x,a) = exp(x * log(a))
        v4sf r0a = _mm_mul_ps(r0, log_a); // x1*log(a), x2*log(a), x3*log(a), 0
        v4sf r1a = _mm_mul_ps(r1, log_a); // y1*log(a), y2*log(a), y3*log(a), 0
        v4sf r2a = _mm_mul_ps(r2, log_a); // z1*log(a), z2*log(a), z3*log(a), 0
        v4sf ex0 = exp_ps(r0a); // pow(x1, a), ..., 0
        v4sf ex1 = exp_ps(r1a); // pow(y1, a), ..., 0
        v4sf ex2 = exp_ps(r2a); // pow(z1, a), ..., 0
        // sum
        v4sf s1 = _mm_add_ps(ex0, ex1);
        v4sf s2 = _mm_add_ps(sum1, ex2);
        // pow(sum, 1/a) = exp(sum * log(1/a))
        v4sf ps = _mm_mul_ps(s2, log_1_a);
        v4sf es = exp_ps(ps);
        ALIGN16_BEG float re[4] ALIGN16_END;
        _mm_store_ps(re, es);
        elts[0].re = re[0];
        elts[1].re = re[1];
        elts[2].re = re[2];
        elts += 3;
    }
}


/* Copyright (C) 2007  Julien Pommier
  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.
  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:
  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution. */

- Kuba hasn't forgotten Monica

2

每当涉及到像 pow() 这样的函数时，缓存一致性只是一个小问题。此外，我真的不认同你应该将参数和输出放在一起的说法 - 虽然这可能有一点帮助，但我认为它同样可能会适得其反。你能用数字支持这个说法吗？ - cmaster - reinstate monica

我基本上同意；这很可能会受到计算能力的限制，因此与内存效率相关的问题相对较小。 - Oliver Charlesworth

3

一些C语言优化-虽然不多，但也有所帮助。

[编辑] 抱歉-回来烦人了：如果FP值变化很大，那么很多时候re = a（或b或c）。进行大量差异测试将取消对某些x、y或z调用pow()的需要。这有助于平均时间，但并不是最坏情况下的时间。

将1.0 / a替换为a_inverse，在循环之前设置。

将pow()替换为powf()，否则会调用double版本的pow()。

小问题：在float re [10000] = {0.0f}中初始化不需要除了刷新内存缓存之外的其他操作。

小问题：使用指针而不是索引数组可能会节省一点时间。

小问题：对于某些平台，使用类型double 可能运行更快-颠倒了我上面关于pow()/powf()的评论。

小问题：尝试使用三个分别为x、y、z的数组。每个都是float *类型。

显然，进行性能分析有所帮助，但让我们假设这在这里是众所周知的。

- chux - Reinstate Monica

同意缓存一致性问题，但我甚至不知道OP的系统是否有缓存（可能已经声明过而我没有注意到）。我认为尝试一下并不是坏建议 - 只是可能不太可能有帮助。分析将是最终的裁决者。 - chux - Reinstate Monica

1

我认为目前市面上没有任何合理的硬件带有浮点运算单元，可以运行C语言且没有缓存。缓存一致性是最基本的要求，因此建议否则是一种过早的悲观主义。你不应该提出这样的建议。它永远都不会有所帮助。在任何当前销售的硬件上，它都会使情况变得更糟。我是认真的。 - Kuba hasn't forgotten Monica

1

@Kuba Ober将嵌入式处理器称为微小的8位微控制器，已经不符合当前流行的16位和32位处理器。OP的问题没有确定平台。有关FP数学性能的问题在PIC以及矢量处理器中经常出现。使用汇编语言编写整个程序仅适用于日益增长的嵌入式市场细分的萎缩部分选项。 - chux - Reinstate Monica

你错过了对SIMD的引用。这是针对PC或类似的“大”平台。
任何32位的东西都可能有缓存，可能是微小的缓存，会被你的多数组悲观主义所破坏。
如果你的嵌入式平台编译器太差，无法处理将1/a提出循环或索引数组与指针+增量的区别，那么是时候换平台了。我早就意识到了这一点。生命太短暂了 :)

- Kuba hasn't forgotten Monica

1

@Kuba Ober，你又错了，因为我没有漏掉参考。OP：“……更喜欢不使用SIMD”。我理解为OP希望出于某种原因避免使用该类并行计算机中可用的该特性。也许是为了使代码在没有该功能的机器上运行更快。 - chux - Reinstate Monica

显示剩余7条评论

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- David Hammen · Accepted Answer

第一步是通过因式分解来消去其中一个指数项。下面的代码使用

(x^a + y^a + z^a)^(1/a) = x * ((1 + (y/x)^a + (z/x)^a)^(1/a))

将三个数中最大的数因式分解可能更加安全，也可能更准确。

另一个优化方法是利用a只能是0.1或0.2这一事实。使用Chebychev多项式逼近x^a。下面的代码仅逼近了x^0.1; x^0.2仅是其平方。最后，由于1/a是一个小整数（5或10），最终的指数运算可以用少量的乘法来代替。

要了解函数powtenth和powtenthnorm中发生的情况，请参见此答案: Optimizations for pow() with const non-integer exponent? .

#include <stdlib.h>
#include <math.h>


float powfive (float x);
float powtenth (float x);
float powtenthnorm (float x);

// Returns (x^0.2 + y^0.2 + z^0.2)^5
float pnormfifth (float x, float y, float z) {
   float u = powtenth(y/x);
   float v = powtenth(z/x);
   return x * powfive (1.0f + u*u + v*v);
}

// Returns (x^0.1 + y^0.1 + z^0.1)^10
float pnormtenth (float x, float y, float z) {
   float u = powtenth(y/x);
   float v = powtenth(z/x);
   float w = powfive (1.0f + u + v);
   return x * w * w;
}

// Returns x^5
float powfive (float x) {
   float xsq = x*x;
   return xsq*xsq*x;
}

// Returns x^0.1.
float powtenth (float x) {
   static const float pow2_tenth[10] = {
      1.0,
      pow(2.0, 0.1),
      pow(4.0, 0.1),
      pow(8.0, 0.1),
      pow(16.0, 0.1),
      pow(32.0, 0.1),
      pow(64.0, 0.1),
      pow(128.0, 0.1),
      pow(256.0, 0.1),
      pow(512.0, 0.1)
   };

   float s;
   int iexp;

   s = frexpf (x, &iexp);
   s *= 2.0;
   iexp -= 1;

   div_t qr = div (iexp, 10);
   if (qr.rem < 0) {
      qr.quot -= 1;
      qr.rem += 10;
   }

   return ldexpf (powtenthnorm(s)*pow2_tenth[qr.rem], qr.quot);
}

// Returns x^0.1 for x in [1,2), to within 1.2e-7 (relative error).
// Want more precision? Add more Chebychev polynomial coefs.
float powtenthnorm (float x) {
   static const int N = 8;

   // Chebychev polynomial terms.
   // Non-zero terms calculated via
   //   integrate (2/pi)*ChebyshevT[n,u]/sqrt(1-u^2)*((u+3)/2)^0.1
   //   from -1 to 1
   // Zeroth term is similar except it uses 1/pi rather than 2/pi.
   static const float Cn[N] = {
       1.0386703502389972,
       3.55833786872637476e-2,
      -2.7458105122604368629e-3,
       2.9828558990819401155e-4,
      -3.70977182883591745389e-5,
       4.96412322412154169407e-6,
      -6.9550743747592849e-7,
       1.00572368333558264698e-7};

   float Tn[N];

   float u = 2.0*x - 3.0;


   Tn[0] = 1.0;
   Tn[1] = u;
   for (int ii = 2; ii < N; ++ii) {
      Tn[ii] = 2*u*Tn[ii-1] - Tn[ii-2];
   }

   float y = 0.0;
   for (int ii = N-1; ii > 0; --ii) {
      y += Cn[ii]*Tn[ii];
   }

   return y + Cn[0];
}