SSE 加载/存储的内存事务

Question

SSE 加载/存储的内存事务

3

在使用SSE指令时，内存-寄存器交互有两种方式：

中间指针：

/**
 * Adds 0.1f to the elements of `input` and writes the sums to `output`,
 * accessing memory through __m128 pointers ("intermediate pointer" variant).
 *
 * @param input  source floats; dereferenced as __m128, so the pointer must be
 *               16-byte aligned (e.g. obtained from _mm_malloc)
 * @param output destination floats; same alignment requirement as `input`
 * @param n      number of floats; only the n/4 complete groups of four are
 *               processed, a trailing n % 4 remainder is left untouched
 */
void f_sse(float *input, float *output, unsigned int n)
{
   __m128 *input_sse = reinterpret_cast<__m128*>(input);   // Input intermediate pointer
   __m128 *output_sse = reinterpret_cast<__m128*>(output); // Output intermediate pointer
   const __m128 s = _mm_set1_ps(0.1f);
   const unsigned int loop_size = n/4;
   // Index type matches loop_size to avoid a signed/unsigned comparison.
   for(unsigned int i=0; i<loop_size; ++i)
      output_sse[i] = _mm_add_ps(input_sse[i], s);
}

显式加载/存储：

/**
 * Adds 0.1f to the elements of `input` and writes the sums to `output`,
 * using explicit aligned load/store intrinsics.
 *
 * @param input  source floats; read with _mm_load_ps, so the pointer must be
 *               16-byte aligned (e.g. obtained from _mm_malloc)
 * @param output destination floats; written with _mm_store_ps, same
 *               alignment requirement as `input`
 * @param n      number of floats; should be a multiple of 4, because the loop
 *               runs while i < n and every iteration touches four floats
 */
void f_sse(float *input, float *output, unsigned int n)
{
   const __m128 s = _mm_set1_ps(0.1f);
   // Unsigned index: matches the type of n and cannot overflow into UB.
   for(unsigned int i=0; i<n; i+=4)
   {
      const __m128 input_sse = _mm_load_ps(input+i);
      const __m128 result    = _mm_add_ps(input_sse, s);
      _mm_store_ps(output+i, result);
   }
}

这些方法有什么区别，哪种方法在性能方面更好？输入和输出指针由_mm_malloc()对齐。

- gorill

如果第一个示例的赋值操作使用未对齐指令，它将会变慢。_mm_store_ps是对齐存储，不是吗？第一个示例类似于逐元素复制。你能展示反汇编输出吗？ - huseyin tugrul buyukisik

2个回答

1

我用 g++ -O2 编译了你的两个示例，发现主要区别在于 edx（n）中的值的使用方式不同，这导致生成的代码略有差异。

第一个函数：

0000000000000000 <_Z6f_sse2PfS_j>:
   0:   c1 ea 02                shr    $0x2,%edx      # loop_size = n / 4. 
   3:   85 d2                   test   %edx,%edx
   5:   74 2d                   je     34 <_Z6f_sse2PfS_j+0x34>
   7:   83 ea 01                sub    $0x1,%edx
   a:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 11 <_Z6f_sse2PfS_j+0x11>
  11:   48 83 c2 01             add    $0x1,%rdx
  15:   31 c0                   xor    %eax,%eax
  17:   48 c1 e2 04             shl    $0x4,%rdx             // Adjust for loop size vs. index. 
  1b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  20:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  24:   0f 58 c1                addps  %xmm1,%xmm0
  27:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  2b:   48 83 c0 10             add    $0x10,%rax
  2f:   48 39 d0                cmp    %rdx,%rax
  32:   75 ec                   jne    20 <_Z6f_sse2PfS_j+0x20>
  34:   f3 c3                   repz retq

第二个函数：

0000000000000000 <_Z5f_ssePfS_j>:
   0:   85 d2                   test   %edx,%edx
   2:   74 22                   je     26 <_Z5f_ssePfS_j+0x26>
   4:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # b <_Z5f_ssePfS_j+0xb>
   b:   31 c0                   xor    %eax,%eax
   d:   31 c9                   xor    %ecx,%ecx
   f:   90                      nop
  10:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
  14:   83 c1 04                add    $0x4,%ecx
  17:   0f 58 c1                addps  %xmm1,%xmm0
  1a:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
  1e:   48 83 c0 10             add    $0x10,%rax
  22:   39 ca                   cmp    %ecx,%edx
  24:   77 ea                   ja     10 <_Z5f_ssePfS_j+0x10>
  26:   f3 c3                   repz retq

我还研究了生成的代码，并写出了下面这个版本：

/**
 * Adds 0.1f to the elements of `input` and writes the sums to `output`,
 * walking both arrays with __m128 pointers instead of an index.
 *
 * @param input  source floats, dereferenced as __m128
 * @param output destination floats, dereferenced as __m128
 * @param n      number of floats; the walk continues while the source
 *               pointer is below input + n
 */
void f_sse2(float *input, float *output, unsigned int n)
{
   const __m128 offset = _mm_set1_ps(0.1f);
   const __m128 *src = reinterpret_cast<const __m128*>(input);
   const __m128 *const stop = reinterpret_cast<const __m128*>(input + n); // one past the last float
   __m128 *dst = reinterpret_cast<__m128*>(output);
   // Advance both pointers by one 4-float vector per iteration.
   for (; src < stop; ++src, ++dst)
      *dst = _mm_add_ps(*src, offset);
}

它生成了如下代码：

0000000000000000 <_Z6f_sse2PfS_j>:
   0:   89 d2                   mov    %edx,%edx
   2:   48 8d 04 97             lea    (%rdi,%rdx,4),%rax
   6:   48 39 c7                cmp    %rax,%rdi
   9:   73 23                   jae    2e <_Z6f_sse2PfS_j+0x2e>
   b:   0f 28 0d 00 00 00 00    movaps 0x0(%rip),%xmm1        # 12 <_Z6f_sse2PfS_j+0x12>
  12:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  18:   0f 28 07                movaps (%rdi),%xmm0
  1b:   48 83 c7 10             add    $0x10,%rdi
  1f:   0f 58 c1                addps  %xmm1,%xmm0
  22:   0f 29 06                movaps %xmm0,(%rsi)
  25:   48 83 c6 10             add    $0x10,%rsi
  29:   48 39 f8                cmp    %rdi,%rax
  2c:   77 ea                   ja     18 <_Z6f_sse2PfS_j+0x18>
  2e:   f3 c3                   repz retq

我认为这可能会更加高效一点，但也许没有必要为此进行更改。但是它让我有了15分钟的事情做。

- Mats Petersson

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- hivert · Accepted Answer

使用g++在优化级别O3下编译的内部循环汇编代码（使用objdump -d）为：

20:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
24:   0f 58 c1                addps  %xmm1,%xmm0
27:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
2b:   48 83 c0 10             add    $0x10,%rax
2f:   48 39 d0                cmp    %rdx,%rax
32:   75 ec                   jne    20 <_Z5f_ssePfS_j+0x20>

以及：

10:   0f 28 04 07             movaps (%rdi,%rax,1),%xmm0
14:   83 c1 04                add    $0x4,%ecx
17:   0f 58 c1                addps  %xmm1,%xmm0
1a:   0f 29 04 06             movaps %xmm0,(%rsi,%rax,1)
1e:   48 83 c0 10             add    $0x10,%rax
22:   39 ca                   cmp    %ecx,%edx
24:   77 ea                   ja     10 <_Z5f_ssePfS_j+0x10>

它们非常相似。在第一个版本中，g++ 只用了一个计数器（只有一条 add 指令），所以我猜它更好一些。