Fastest way to convert 9 char digits to int or unsigned int

6
#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include <memory>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <immintrin.h>
using namespace std;

const int p[9] =   {1, 10, 100, 
                    1000, 10000, 100000, 
                    1000000, 10000000, 100000000};
                    
class MyTimer {
 private:
  std::chrono::time_point<std::chrono::steady_clock> starter;

 public:
  void startCounter() {
    starter = std::chrono::steady_clock::now();
  }

  int64_t getCounterNs() {    
    return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - starter).count();
  }
};
                    
int convert1(const char *a) {
    int res = 0;
    for (int i=0; i<9; i++) res = res * 10 + a[i] - 48;
    return res;
}

int convert2(const char *a) {
    return (a[0] - 48) * p[8] + (a[1] - 48) * p[7] + (a[2] - 48) * p[6]
            + (a[3] - 48) * p[5] + (a[4] - 48) * p[4] + (a[5] - 48) * p[3]
            + (a[6] - 48) * p[2] + (a[7] - 48) * p[1] + (a[8] - 48) * p[0];
}

int convert3(const char *a) {
    return (a[0] - 48) * p[8] + a[1] * p[7] + a[2] * p[6] + a[3] * p[5]
            + a[4] * p[4] + a[5] * p[3] + a[6] * p[2] + a[7] * p[1] + a[8]
            - 533333328;
}

const unsigned pu[9] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000,
    100000000};

int convert4u(const char *aa) {
  const unsigned char *a = (const unsigned char*) aa;
  return a[0] * pu[8] + a[1] * pu[7] + a[2] * pu[6] + a[3] * pu[5] + a[4] * pu[4]
      + a[5] * pu[3] + a[6] * pu[2] + a[7] * pu[1] + a[8] - (unsigned) 5333333328u;
}

int convert5(const char* a) {
    int val = 0;
    for(size_t k =0;k <9;++k) {
        val = (val << 3) + (val << 1) + (a[k]-'0');
    }
    return val;
}

const unsigned pu2[9] = {100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1};

int convert6u(const char *a) {
  return a[0]*pu2[0] + a[1]*pu2[1] + a[2]*pu2[2] + a[3] * pu2[3] + a[4] * pu2[4] + a[5] * pu2[5] + a[6] * pu2[6] + a[7] * pu2[7] + a[8] - (unsigned) 5333333328u;
}

constexpr std::uint64_t zeros(char z) {
    std::uint64_t result = 0;
    for (int i = 0; i < sizeof(result); ++i) {
        result = result*256 + z;
    }
    return result;
}

int convertX(const char *a) {
    constexpr std::uint64_t offset = zeros('0');
    constexpr std::uint64_t o1 = 0xFF00FF00FF00FF00;
    constexpr std::uint64_t o2 = 0xFFFF0000FFFF0000;
    constexpr std::uint64_t o3 = 0xFFFFFFFF00000000;

    std::uint64_t buffer;
    std::memcpy(&buffer, a, sizeof(buffer));
    const auto bytes = buffer - offset;              // per-byte digit values; a[0] in the low byte
    const auto b1 = (bytes & o1) >> 8;               // odd-position digits, moved down
    const auto words = 10*(bytes & ~o1) + b1;        // 2-digit values in each 16-bit word
    const auto w1 = (words & o2) >> 16;
    const auto dwords = 100*(words & ~o2) + w1;      // 4-digit values in each 32-bit dword
    const auto d1 = (dwords & o3) >> 32;
    const auto qwords = 10000*(dwords & ~o3) + d1;   // 8-digit value of a[0..7]

    const auto final = 10*static_cast<unsigned>(qwords) + (a[8] - '0');
    return static_cast<int>(final);
}

//########################  ACCEPTED ANSWER
//########################
//########################
typedef struct {             // for output into memory
    alignas(16) unsigned hours;
    unsigned minutes, seconds, nanos;
} hmsn;

void str2hmsn(hmsn *out, const char str[15])  // HHMMSSXXXXXXXXX  15 total, with 9-digit nanoseconds.
{    // 15 not including the terminating 0 (if any) which we don't read
    //hmsn retval;
    __m128i digs = _mm_loadu_si128((const __m128i*)str);
    digs = _mm_sub_epi8( digs, _mm_set1_epi8('0') );
    __m128i hms_x_words = _mm_maddubs_epi16( digs, _mm_set1_epi16( 10U + (1U<<8) ));   // SSSE3  pairs of digits => 10s, 1s places.

    __m128i hms_unpacked = _mm_cvtepu16_epi32(hms_x_words);                           // SSE4.1  hours, minutes, seconds unpack from uint16_t to uint32
    //_mm_storeu_si128((__m128i*)&retval, hms_unpacked);                                  // store first 3 struct members; last to be written separately
    _mm_storeu_si128((__m128i*)out, hms_unpacked);
    // or scalar extract with _mm_cvtsi128_si64 (movq) and shift / movzx

    __m128i xwords = _mm_bsrli_si128(hms_x_words, 6);  // would like to schedule this sooner, so oldest-uop-first starts this critical path shuffle ahead of pmovzx
    // 8 bytes of data, lined up in low 2 dwords, rather than split across high 3
    // could have got here with an 8-byte load that starts here, if we didn't want to get the H,M,S integers cheaply.

    __m128i xdwords = _mm_madd_epi16(xwords, _mm_setr_epi16(100, 1, 100, 1,  0,0,0,0));   // low/high uint32 chunks, discard the 9th x digit.
    uint64_t pair32 = _mm_cvtsi128_si64(xdwords);
    uint32_t msd = 100*100 * (uint32_t)pair32;     // most significant dword was at lower address (in printing order), so low half on little-endian x86.  encourage compilers to use 32-bit operand-size for imul
    uint32_t first8_x = msd + (uint32_t)(pair32 >> 32);
    uint32_t nanos = first8_x * 10 + ((unsigned char)str[14] - '0');   // total*10 + lowest digit
    out->nanos = nanos;
    //retval.nanos = nanos;
    //return retval;

  // returning the struct by value encourages compilers in the wrong direction
  // into not doing separate stores, even when inlining into a function that assigns the whole struct to a pointed-to output
}
hmsn mystruct;

int convertSIMD(const char* a)
{
    str2hmsn(&mystruct, a);
    return mystruct.nanos;
}


//########################
//########################
using ConvertFunc = int(const char*);

volatile int result = 0; // use the result of the function so the compiler can't optimize the calls away
void benchmark(ConvertFunc converter, string name, int numTest=1000) {
    MyTimer timer;
    const int N = 100000;
    char *a = new char[9*N + 17];
    int64_t runtime = 0;    

    for (int t=1; t<=numTest; t++) {        
        // change something to prevent unexpected optimization
        for (int i=0; i<9*N; i++) a[i] = rand() % 10 + '0'; 

        timer.startCounter();
        for (int i=0; i<9*N; i+= 9) result = converter(a+i);
        runtime += timer.getCounterNs();
    }
    cout << name << ": " << (runtime / (double(numTest) * N)) << "ns average\n";
    delete[] a;
}

int main() {
    benchmark(convert1, "slow");
    benchmark(convert2, "normal");    
    benchmark(convert3, "fast");
    benchmark(convert4u, "unsigned");
    benchmark(convert5, "shifting");
    benchmark(convert6u, "reverse");
    benchmark(convertX, "swar64");
    benchmark(convertSIMD, "manualSIMD");

    return 0;
}

I want to find the fastest way to convert char a[9] to int. The full problem is converting a char a[15] timestamp of the form HHMMSSxxxxxxxxx to nanoseconds, where ~50 bytes after the x are allocated and can safely be read (but not written). We only care about the last 9 digits in this question.
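
(For reference, a plain scalar version of the full 15-character task could look like the sketch below; timestamp2nanos and its helper are hypothetical names, not part of the benchmark code in this question.)

#include <cstdint>

// Hypothetical scalar reference for the full HHMMSSxxxxxxxxx -> nanoseconds
// conversion; the benchmark below only measures the 9-digit part.
uint64_t timestamp2nanos(const char *a) {
    auto two = [](const char *p) { return (p[0] - '0') * 10 + (p[1] - '0'); };
    uint64_t secs = two(a) * 3600u + two(a + 2) * 60u + two(a + 4);
    uint32_t nanos = 0;
    for (int i = 6; i < 15; i++) nanos = nanos * 10 + (a[i] - '0');
    return secs * 1000000000ull + nanos;
}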

Version 1 is the basic one; versions 2 and 3 try to save some computation. I compile with the -O3 flag, and storing the powers of 10 in an array is fine because it gets optimized away (checked on Godbolt).

How can I make this faster? Yes, I know this sounds like premature optimization, but let's assume I need that final 2-3% boost.

Big edit: I've changed the code to reduce the effect of std::chrono on the measured time. The results are very different: 2700ms, 810ms, 670ms. On my laptop with an i7 8750H, gcc 9.3.0 with the -O3 flag, the results are: 355, 387, 320ms.

Version 3 is decidedly faster, while version 2 is slower due to larger code size. But can we do better than version 3?

Invalid benchmark

Edit 2: the function can return unsigned int instead of int, i.e.

unsigned convert1(char *a);

Edit 3: I noticed the new code is an invalid benchmark, since convert(a) is only actually executed once. With the original code, the difference is only about 1%.

Edit 4: New benchmark. Using unsigned (convert4u, convert6u) is 3-5% faster than using int. I'll run a long (10+ min) benchmark to see if there's a winner. I've edited the code to use the new benchmark: it generates a large amount of data, then runs the converter functions over it.

Edit 5: results: 4.19, 4.51, 3.82, 3.59, 7.64, 3.72 seconds. The unsigned version is fastest. Is it possible to use SIMD on just 9 bytes? If not, then I guess this is the best solution. I'm still hoping for a crazier solution, though.

Edit 6: benchmark results on an AMD Ryzen 4350G, gcc version 10.3, compiled with gcc -o main main.cpp -std=c++17 -O3 -mavx -mavx2 -march=native:

slow: 4.17794ns average
normal: 2.59945ns average
fast: 2.27917ns average
unsigned: 2.43814ns average
shifting: 4.72233ns average
reverse: 2.2274ns average
swar64: 2.17179ns average
manualSIMD: 1.55203ns average

The accepted answer does even more than the question asked for, computing HH/MM/SS/nanosec, so it's even faster than this benchmark shows.


3
Which compiler flags did you use? Did you turn optimization on? I suspect not, because you don't use result, so the compiler should optimize such that converter is never called at all. - 463035818_is_not_a_number
4
Have you compared against std::from_chars()? - einpoklum
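
(A sketch of the comparison that comment suggests, using C++17 std::from_chars from <charconv>; convert_fc is a hypothetical name, error handling omitted:)

#include <charconv>

unsigned convert_fc(const char *a) {
    unsigned value = 0;
    std::from_chars(a, a + 9, value);   // parse the fixed 9-digit field, base 10
    return value;
}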
2
If the code used unsigned math, (a[0] - 48) * p[8] ... - 533333328 could become a[0] * p[8] ... - (unsigned)5333333328u. (Hopefully I got the constant right.) - chux - Reinstate Monica
4
Your benchmark is completely invalid. The compiler (or at least one compiler) clearly sees through your for (int i=1; i<= 1000000; i++) result = converter(a); loop and understands that the function always returns the same result for the same input. Note the instruction mov dword ptr [rip + result], eax repeated 10 times in a row, then a jump straight back to the start of the block to get 1000000 repetitions. The actual conversion is executed exactly once. (And the timing looks 10x faster than with an older compiler.) - n. m.
4
Maybe you should try implementing atoi with SIMD. See https://dev59.com/O1sW5IYBdhLWcg3wT115 and http://0x80.pl/articles/simd-parsing-int-sequences.html - Zuljin
3 Answers

10

Yes, SIMD is possible, as mentioned in comments. You can use it to parse the HH, MM, and SS parts of the string at the same time.

Since you have a 100% fixed format, with leading 0s where necessary, this is easier than How to implement atoi using SIMD? - the place values are fixed, and we don't need any compare / bit-scan or pcmpistri to look up a shuffle control mask or scale factor. Also, SIMD string to unsigned int parsing in C# performance improvement has some good ideas, like tweaking the place-value multipliers to avoid a step at the end (see the BMI2 version at the end of this answer, which also uses that trick).

9 decimal digits is two dwords of input plus one leftover byte that's probably best grabbed separately.

Assuming you care about throughput (the ability to overlap with surrounding code, or to do this in a loop over independent elements) rather than critical-path latency in cycles from the input pointer and the data being ready in memory until the nanoseconds integer is ready, SSSE3 SIMD should be very good on modern x86. (SSE4.1 is also useful if you want to unpack your hours, minutes, and seconds into contiguous uint32_t elements, e.g. in a struct.) It may be competitive on latency, too, vs. scalar.

Fun fact: clang auto-vectorizes your convert2 / convert3 functions, widening to 8x dword in a YMM register for vpmulld (2 uops), then a chain of shuffle/add.

The strategy is to use pmaddubsw and pmaddwd to multiply-and-add pairs horizontally, in a way that gets each digit multiplied by its place value: e.g. 10 and 1 pairs first, then 100 and 1 for the pairs of integers that came from double digits. Then the last pair is extracted to scalar: multiply the most significant part by 100 * 100 and add it to the least significant part. I'm pretty sure overflow is impossible at any step for inputs that are actually '0'..'9'. This runs and compiles to the asm I expected, but I haven't verified the numeric results.

// See also an updated version using RORX as discussed in comments
#include <immintrin.h>

typedef struct {             // for output into memory
    alignas(16) unsigned hours;
    unsigned minutes, seconds, nanos;
} hmsn;

void str2hmsn(hmsn *out, const char str[15])  // HHMMSSXXXXXXXXX  15 total, with 9-digit nanoseconds.
{    // 15 not including the terminating 0 (if any) which we don't read
    //hmsn retval;
    __m128i digs = _mm_loadu_si128((const __m128i*)str);
    digs = _mm_sub_epi8( digs, _mm_set1_epi8('0') );
    __m128i hms_x_words = _mm_maddubs_epi16( digs, _mm_set1_epi16( 10U + (1U<<8) ));   // SSSE3  pairs of digits => 10s, 1s places.

    __m128i hms_unpacked = _mm_cvtepu16_epi32(hms_x_words);                           // SSE4.1  hours, minutes, seconds unpack from uint16_t to uint32
    //_mm_storeu_si128((__m128i*)&retval, hms_unpacked);                                  // store first 3 struct members; last to be written separately
    _mm_storeu_si128((__m128i*)out, hms_unpacked);
    // or scalar extract with _mm_cvtsi128_si64 (movq) and shift / movzx

    __m128i xwords = _mm_bsrli_si128(hms_x_words, 6);  // would like to schedule this sooner, so oldest-uop-first starts this critical path shuffle ahead of pmovzx
    // 8 bytes of data, lined up in low 2 dwords, rather than split across high 3
    // could have got here with an 8-byte load that starts here, if we didn't want to get the H,M,S integers cheaply.

    __m128i xdwords = _mm_madd_epi16(xwords, _mm_setr_epi16(100, 1, 100, 1,  0,0,0,0));   // low/high uint32 chunks, discard the 9th x digit.
    uint64_t pair32 = _mm_cvtsi128_si64(xdwords);
    uint32_t msd = 100*100 * (uint32_t)pair32;     // most significant dword was at lower address (in printing order), so low half on little-endian x86.  encourage compilers to use 32-bit operand-size for imul
    uint32_t first8_x = msd + (uint32_t)(pair32 >> 32);
    uint32_t nanos = first8_x * 10 + ((unsigned char)str[14] - '0');   // total*10 + lowest digit
    out->nanos = nanos;
    //retval.nanos = nanos;
    //return retval;

  // returning the struct by value encourages compilers in the wrong direction
  // into not doing separate stores, even when inlining into a function that assigns the whole struct to a pointed-to output
}

On Godbolt with a test loop that uses asm("" ::"m"(sink): "memory" ) to make the compiler redo the work in a loop. Or a std::atomic_thread_fence(acq_rel) hack that gets MSVC to not optimize away the loop either. On my i7-6700k with GCC 11.1, x86-64 GNU/Linux, energy_performance_preference = performance, I got this to run at one iteration per 5 cycles.

I don't know why it doesn't run at one per 4 cycles; I tweaked GCC options to avoid the JCC erratum slowdown without padding, and hopefully to fit the loop in 4 uop cache lines (6 uops, a line of 1 uop ending at a 32B boundary, 6 uops, 2 uops ending with the dec/jnz). Perf counters say the front-end was "ok", and uops_dispatched_port shows all 4 ALU ports at under 4 uops per iteration, the highest being port0 at 3.34. Manually padding the early instructions gets it down to 3 lines, 18 uops total, but still no improvement from 5 cycles per iteration, so I guess the front-end really is ok.

LLVM-MCA seems very ambitious in predicting 3c per iteration, apparently based on a wrong model of Skylake with a "dispatch" (front-end rename, I think) width of 6. Even with a proper 4-wide model, -mcpu=haswell, it predicts 4.5c. (I used the asm("# LLVM-MCA-BEGIN") etc. macros on Godbolt and included an LLVM-MCA output window for the test loop.) It doesn't have a fully accurate uop->port mapping, apparently not knowing about slow-LEA running only on port 1, but I don't know whether that matters.

Throughput may be limited by the ability to find instruction-level parallelism and overlap across multiple iterations, as in Understanding the impact of lfence on a loop with two long dependency chains, for increasing lengths.

The test loop is:

#include <stdlib.h>
#ifndef __cplusplus
#include <stdalign.h>
#endif
#include <stdint.h>

#if 1 && defined(__GNUC__)
#define LLVM_MCA_BEGIN  asm("# LLVM-MCA-BEGIN")
#define LLVM_MCA_END  asm("# LLVM-MCA-END")
#else
#define LLVM_MCA_BEGIN
#define LLVM_MCA_END
#endif


#if defined(__cplusplus)
    #include <atomic>
    using std::atomic_thread_fence, std::memory_order_acq_rel;
#else
    #include <stdatomic.h>
#endif

unsigned testloop(const char str[15]){
    hmsn sink;
    for (int i=0 ; i<1000000000 ; i++){
        LLVM_MCA_BEGIN;
        str2hmsn(&sink, str);
        // compiler memory barrier 
        // force materializing the result, and forget about the input string being the same
#ifdef __GNUC__
        asm volatile("" ::"m"(sink): "memory");
#else
  //#warning happens to be enough with current MSVC
        atomic_thread_fence(memory_order_acq_rel); // strongest barrier that doesn't require any asm instructions on x86; MSVC defeats signal_fence.
#endif
    }
    LLVM_MCA_END;
    volatile unsigned dummy = sink.hours + sink.nanos;  // make sure both halves are really used, else MSVC optimizes.
    return dummy;
}



int main(int argc, char *argv[])
{
    // performance isn't data-dependent, so just use a handy string.
    // alignas(16) static char str[] = "235959123456789";
    uintptr_t p = (uintptr_t)argv[0];
    p &= -16;
    return testloop((char*)p);   // argv[0] apparently has a cache-line split within 16 bytes on my system, worsening from 5c throughput to 6.12c
}

I compiled as follows, to squeeze the loop in so it ends just before the 32-byte boundary it's almost touching. Note that -march=haswell allows AVX encodings, which save an instruction or two.
$ g++ -fno-omit-frame-pointer -fno-stack-protector -falign-loops=16 -O3 -march=haswell foo.c -masm=intel
$ objdump -drwC -Mintel a.out | less

...
0000000000001190 <testloop(char const*)>:
  1190:   55                    push   rbp
  1191:   b9 00 ca 9a 3b        mov    ecx,0x3b9aca00
  1196:   48 89 e5              mov    rbp,rsp
  1199:   c5 f9 6f 25 6f 0e 00 00    vmovdqa xmm4,[rip+0xe6f]        # 2010
  11a1:   c5 f9 6f 15 77 0e 00 00    vmovdqa xmm2, [rip+0xe77]        # 2020  # vector constants hoisted
  11a9:   c5 f9 6f 0d 7f 0e 00 00    vmovdqa xmm1, [rip+0xe7f]        # 2030
  11b1:   66 66 2e 0f 1f 84 00 00 00 00 00      data16 cs nop WORD PTR [rax+rax*1+0x0]
  11bc:   0f 1f 40 00        nop    DWORD PTR [rax+0x0]
### Top of loop is 16-byte aligned here, instead of ending up with 8 byte default
  11c0:   c5 d9 fc 07        vpaddb xmm0,xmm4, [rdi]
  11c4:   c4 e2 79 04 c2     vpmaddubsw xmm0,xmm0,xmm2
  11c9:   c4 e2 79 33 d8     vpmovzxwd xmm3,xmm0
  11ce:   c5 f9 73 d8 06     vpsrldq xmm0,xmm0,0x6
  11d3:   c5 f9 f5 c1        vpmaddwd xmm0,xmm0,xmm1
  11d7:   c5 f9 7f 5d f0     vmovdqa [rbp-0x10],xmm3
  11dc:   c4 e1 f9 7e c0     vmovq  rax,xmm0
  11e1:   69 d0 10 27 00 00  imul   edx,eax,0x2710
  11e7:   48 c1 e8 20        shr    rax,0x20
  11eb:   01 d0              add    eax,edx
  11ed:   8d 14 80           lea    edx,[rax+rax*4]
  11f0:   0f b6 47 0e        movzx  eax,BYTE PTR [rdi+0xe]
  11f4:   8d 44 50 d0        lea    eax,[rax+rdx*2-0x30]
  11f8:   89 45 fc           mov    DWORD PTR [rbp-0x4],eax
  11fb:   ff c9              dec    ecx
  11fd:   75 c1              jne    11c0 <testloop(char const*)+0x30>
  # loop ends 1 byte before it would be a problem for the JCC erratum workaround
  11ff:   8b 45 fc              mov    eax,DWORD PTR [rbp-0x4]

So GCC produced the asm I had planned by hand before writing it with intrinsics, using as few instructions as possible to optimize for throughput. (Clang favours latency in this loop, using a separate add instead of a 3-component LEA.)

This is faster than any scalar version that only parses X, and it's parsing HH, MM, and SS as well. clang's auto-vectorization of convert3 might give it some competition there, but strangely it doesn't do that when inlining.

GCC's scalar convert3 takes 8 cycles per iteration. clang's scalar convert3 in a loop takes 7, running at 4.0 fused-domain uops/clock, maxing out front-end bandwidth and saturating port 1 with one imul uop per cycle. (It reloads every byte with movzx and stores the scalar result to a stack local each iteration, but doesn't touch the HHMMSS bytes.)

$ taskset -c 3 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,instructions,uops_issued.any,uops_executed.thread,idq.mite_uops,idq_uops_not_delivered.cycles_fe_was_ok -r1 ./a.out

 Performance counter stats for './a.out':

          1,221.82 msec task-clock                #    1.000 CPUs utilized          
                 0      context-switches          #    0.000 /sec                   
                 0      cpu-migrations            #    0.000 /sec                   
               105      page-faults               #   85.937 /sec                   
     5,079,784,301      cycles                    #    4.158 GHz                    
    16,002,910,115      instructions              #    3.15  insn per cycle         
    15,004,354,053      uops_issued.any           #   12.280 G/sec                  
    18,003,922,693      uops_executed.thread      #   14.735 G/sec                  
         1,484,567      idq.mite_uops             #    1.215 M/sec                  
     5,079,431,697      idq_uops_not_delivered.cycles_fe_was_ok #    4.157 G/sec                  

       1.222107519 seconds time elapsed

       1.221794000 seconds user
       0.000000000 seconds sys

Note that this is for 1G iterations, so 5.08G cycles means an average throughput of 5.08 cycles per iteration.

Removing the extra work to produce the HHMMSS part of the output (the vpsrldq, vpmovzxwd, and vmovdqa store), keeping just the 9-digit integer part, it runs at 4.0 cycles per iteration on Skylake; or 3.5 without the scalar store at the end. (I edited GCC's asm output to comment out that instruction, so I know it's still doing all the work.)

The fact that there's some kind of back-end bottleneck here (rather than front-end) is probably a good thing for overlapping this with independent work.
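
(For reference, a trimmed nanos-only variant along those lines might look like the sketch below; parse9_simd is a hypothetical name, and this is an untested illustration of dropping the HH/MM/SS work, not code from the answer. It assumes SSSE3.)

#include <immintrin.h>
#include <cstdint>

unsigned parse9_simd(const char *str)   // str[0..8] are ASCII digits
{
    __m128i digs = _mm_loadl_epi64((const __m128i*)str);    // 8-byte load of the first 8 digits
    digs = _mm_sub_epi8(digs, _mm_set1_epi8('0'));
    __m128i pairs  = _mm_maddubs_epi16(digs, _mm_set1_epi16(10U + (1U<<8)));          // 4x 2-digit values
    __m128i dwords = _mm_madd_epi16(pairs, _mm_setr_epi16(100, 1, 100, 1, 0,0,0,0));  // 2x 4-digit values
    uint64_t pair32 = _mm_cvtsi128_si64(dwords);
    uint32_t first8 = 10000 * (uint32_t)pair32 + (uint32_t)(pair32 >> 32);            // 8-digit value
    return first8 * 10 + ((unsigned char)str[8] - '0');                               // append the 9th digit
}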

Alternate version using BMI2 rorx

@aqrit's answer on SIMD string to unsigned int parsing in C# performance improvement inspired a version that allows the remaining high * 2 part to be done as part of an LEA instead of scalar ADD, using that movq strategy instead of pshufd/paddd. After coaxing GCC into emitting RORX to copy-and-extract instead of a braindead 2x vmovq r64, xmm0, that gets us down to 14 front-end uops, down from 16, and unfused domain uops 17 down from 18. (clang deoptimizes to mov+shr). Godbolt

// BMI2 version, compiles to efficient asm with GCC11
void str2hmsn_rorx(hmsn *out, const char str[15])  // HHMMSSXXXXXXXXX  15 total, with 9-digit nanoseconds.
{    // 15 not including the terminating 0 (if any) which we don't read
    __m128i digs = _mm_loadu_si128((const __m128i*)str);
    digs = _mm_sub_epi8( digs, _mm_set1_epi8('0') );
    const __m128i mul1 = _mm_set_epi16(0, 0x010A, 0x0A64, 0x14C8, 0x14C8 /* nanos 7 .. 0 */, 0x010A, 0x010A, 0x010A /* SS, MM, HH */);
    const __m128i mul2 = _mm_set_epi32(0, 0, 0x0001000A, 0x00FA61A8);  // extra scaling for the more-significant half baked in to save an imul

    //__m128i hms_x_words = _mm_maddubs_epi16( digs, _mm_set1_epi16( 10U + (1U<<8) ));   // SSSE3  pairs of digits => 10s, 1s places in printing order.
    __m128i hms_x_words = _mm_maddubs_epi16(mul1, digs);    // mul1 as the unsigned operand (first)

    // or scalar extract with _mm_cvtsi128_si64 (movq) instead of unpack, and shift / movzx
    __m128i hms_unpacked = _mm_cvtepu16_epi32(hms_x_words);             // SSE4.1 pmovxzwd   hours, minutes, seconds unpack from u16 to u32
    _mm_storeu_si128((__m128i*)out, hms_unpacked);

    __m128i xwords = _mm_bsrli_si128(hms_x_words, 6);  // would like to schedule this sooner, so oldest-uop-first starts this critical path shuffle ahead of pmovzx
    // 8 bytes of data, lined up in low 2 dwords, rather than split across high 3
    // could have got here with an 8-byte load that starts here, if we didn't want to get the H,M,S integers cheaply.

//  __m128i xdwords = _mm_madd_epi16(xwords, _mm_setr_epi16(100, 1, 100, 1,  0,0,0,0));   // low/high uint32 chunks, discard the 9th x digit.
    __m128i xdwords = _mm_madd_epi16(xwords, mul2);   // low/high uint32 chunks, without the 9th x digit.
    uint64_t pair32 = _mm_cvtsi128_si64(xdwords);
//  uint32_t msd = 100*100 * (uint32_t)pair32;     // most significant dword was at lower address (in printing order), so low half on little-endian x86.  encourage compilers to use 32-bit operand-size for imul
//  uint32_t first8_x = msd + (uint32_t)(pair32 >> 32);
//  uint32_t nanos = first8_x * 10 + ((unsigned char)str[14] - '0');   // total*10 + lowest digit

    uint32_t msd = 2 * (uint32_t)pair32;     // most significant dword was at lower address (in printing order), so low bits of qword on little-endian x86.
//  uint32_t first8_x = msd + (uint32_t)(pair32 >> 32);
    uint32_t first8_x = msd + (uint32_t)_lrotr(pair32, 32);  // try to get it to use rorx to copy-and-extract the high half
    // FIXME: _rotr64 not available everywhere, but _lrotr is 32-bit on Windows.

    uint32_t nanos = first8_x * 10 + ((unsigned char)str[14] - '0');   // total*10 + lowest digit
    out->nanos = nanos;
}

(_lrotr requires GCC11 or later. And it's a 32-bit rotate on Windows. _rotr64 isn't available everywhere either. On earlier GCC, look for a different intrinsic or rotate idiom that gets the compiler to use rorx dst, src, 32 instead of mov+shr.)
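
(A common rotate idiom that GCC and clang pattern-match, as a sketch of the kind of fallback meant above; rotr64 is a hypothetical helper:)

#include <cstdint>

static inline uint64_t rotr64(uint64_t x, unsigned n) {   // n in [1, 63]
    return (x >> n) | (x << (64 - n));   // recognized as a rotate; can compile to rorx with BMI2
}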

Inlined into the testloop() in the Godbolt link (which can hoist the constants out of the loop, but forces the work to be redone repeatedly), uiCA (https://uica.uops.info/) predicts that Skylake could run it at about one iteration per 3.78 clocks, including a dec/jnz at the bottom of the loop and a store of the result, but no pointer increment. (uiCA is significantly more accurate than LLVM-MCA.)

Ice Lake / Rocket Lake could run this at one iteration per 3.14 clocks.


2
This is the kind of crazy solution I was hoping for, thanks! What kind of book do I need to start learning this stuff? - Huy Le
1
I only care about x86 assembly / intrinsics. Are there any practical optimization books on the subject? - Huy Le
4
@HuyLe: Three sites:
  • https://agner.org/optimize/ for microarchitecture details, e.g. how the uop cache works in Sandybridge-family CPUs.
  • https://www.realworldtech.com/haswell-cpu/ for some info on Haswell.
  • https://uops.info/, an improved version of Agner's instruction tables with automated result collection, avoiding Agner's occasional typos and omissions. (Though Agner's manual testing has found performance effects that Andreas Abel then had to update uops.info to look for.)
Also see https://stackoverflow.com/tags/x86/info, and the [x86] / [cpu-architecture] tags on SO for things like the JCC erratum.
- Peter Cordes
2
@aqrit: Thanks, that does allow the remaining high * 2 part to be done as part of an LEA, using the movq strategy instead of pshufd/paddd. After coaxing GCC into emitting RORX to copy-and-extract instead of a braindead 2x vmovq r64, xmm0, that gets us down to 4.58c per iteration (14 front-end uops, down from 16; 17 unfused-domain uops, down from 18). (clang deoptimizes to mov+shr.) https://godbolt.org/z/6oxzY4EWK Will update the answer at some point. - Peter Cordes
2
@HuyLe: Or fix your VM config to pass through the instruction-set extensions you compiled for. Many default to disabling a lot of features to allow migrating VMs to older CPUs. - Peter Cordes

5

An alternative

Use unsigned math to avoid the undefined behavior of int overflow, and to allow all the -48s to be folded into one constant (the 48s, multiplied by each place value and summed, come to 48 * 111111111 = 5333333328):

const unsigned p[9] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000,
    100000000};

int convert4u(const char *aa) {
  const unsigned char *a = (const unsigned char*) aa;
  return a[0] * p[8] + a[1] * p[7] + a[2] * p[6] + a[3] * p[5] + a[4] * p[4]
      + a[5] * p[3] + a[6] * p[2] + a[7] * p[1] + a[8] - (unsigned) 5333333328u;
}

Also try ordering p[9] like a[], which may ease parallel calculation. I see no downside to the re-ordering:
const unsigned p[9] = {100000000, 10000000, ..., 1};

int convert4u(const char *aa) {
  const unsigned char *a = (const unsigned char*) aa;
  return a[0]*p[0] + a[1]*p[1] + ... + a[7]*p[7] + a[8] - (unsigned) 5333333328u;
}

@HuyLe Perhaps the second, re-ordered idea helps? In any case, an interesting problem. Hope you can squeeze out a few more percent somehow. - chux - Reinstate Monica
2
@HuyLe To be fair, with the compilers/implementations in use today, the order didn't matter. Likewise for using characters vs. digit values. That's the trouble with micro-optimizations like this: the results may differ with next year's processors/compilers. - chux - Reinstate Monica
2
OK, I changed the benchmark a bit, and the unsigned version is 1-2% faster than convert3() most of the time. I think I need to set up a good benchmark first. - Huy Le
a[i] = rand() was my mistake: my code assumes a[i] must be in the range '0'..'9', and otherwise it overflows. That's fixed, and I'm running a long benchmark now. - Huy Le
@chux-ReinstateMonica: You might be interested in my answer. With SIMD and AVX on Skylake, I can parse the whole HHMMSSXX...XX string and store the results to memory at one per 5 cycles. Latency is probably worse, but I optimized for throughput, since timestamps are usually just data. - Peter Cordes

3

You don't necessarily need special SIMD instructions to do computation in parallel. By using a 64-bit unsigned integer, we can process eight of the nine bytes in parallel, then deal with the ninth separately at the end.

constexpr std::uint64_t zeros(char z) {
    std::uint64_t result = 0;
    for (int i = 0; i < sizeof(result); ++i) {
        result = result*256 + z;
    }
    return result;
}

unsigned convertX(const char *a) {
    constexpr std::uint64_t offset = zeros('0');
    constexpr std::uint64_t o1 = 0xFF00FF00FF00FF00;
    constexpr std::uint64_t o2 = 0xFFFF0000FFFF0000;
    constexpr std::uint64_t o3 = 0xFFFFFFFF00000000;

    std::uint64_t buffer;
    std::memcpy(&buffer, a, sizeof(buffer));
    const auto bytes = buffer - offset;              // per-byte digit values; a[0] in the low byte
    const auto b1 = (bytes & o1) >> 8;               // odd-position digits, moved down
    const auto words = 10*(bytes & ~o1) + b1;        // 2-digit values in each 16-bit word
    const auto w1 = (words & o2) >> 16;
    const auto dwords = 100*(words & ~o2) + w1;      // 4-digit values in each 32-bit dword
    const auto d1 = (dwords & o3) >> 32;
    const auto qwords = 10000*(dwords & ~o3) + d1;   // 8-digit value of a[0..7]

    const auto final = 10*static_cast<unsigned>(qwords) + (a[8] - '0');
    return static_cast<unsigned>(final);
}
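
To make the steps concrete, here's a hand-worked trace for the input "123456789" (an illustration added here, not from the original answer):

// bytes : {1,2,3,4,5,6,7,8}    per-byte digit values; a[0] is in the low byte on little-endian
// words : {12, 34, 56, 78}     each 16-bit word holds 10*first digit + second digit
// dwords: {1234, 5678}         each 32-bit dword holds 100*low word + high word
// qwords: 12345678             10000*low dword + high dword
// final : 12345678 * 10 + 9 == 123456789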

I tested with MS Visual C++ (64-bit); this solution benchmarked at just over 200 ms, while the others were all right around 400 ms. That makes sense, since it uses about half the multiply and add instructions of the "normal" solution.

I know the memcpy looks wasteful, but it avoids undefined behavior and alignment problems.
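
(For contrast, a sketch of what the memcpy replaces; the commented-out line is the problematic form:)

// std::uint64_t buffer = *reinterpret_cast<const std::uint64_t*>(a);  // UB: strict aliasing + possible misalignment
std::uint64_t buffer;
std::memcpy(&buffer, a, sizeof(buffer));   // well-defined; optimizers emit a single 8-byte load on x86-64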


I just redid the benchmark and updated the post. The manual SIMD solution is still faster (and it actually does more work, too). - Huy Le
