#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
#include <memory>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <immintrin.h>
using namespace std;
const int p[9] = {1, 10, 100,
1000, 10000, 100000,
1000000, 10000000, 100000000};
class MyTimer {
private:
std::chrono::time_point<std::chrono::steady_clock> starter;
public:
void startCounter() {
starter = std::chrono::steady_clock::now();
}
int64_t getCounterNs() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - starter).count();
}
};
int convert1(const char *a) {
int res = 0;
for (int i=0; i<9; i++) res = res * 10 + a[i] - 48;
return res;
}
int convert2(const char *a) {
return (a[0] - 48) * p[8] + (a[1] - 48) * p[7] + (a[2] - 48) * p[6]
+ (a[3] - 48) * p[5] + (a[4] - 48) * p[4] + (a[5] - 48) * p[3]
+ (a[6] - 48) * p[2] + (a[7] - 48) * p[1] + (a[8] - 48) * p[0];
}
int convert3(const char *a) {
return (a[0] - 48) * p[8] + a[1] * p[7] + a[2] * p[6] + a[3] * p[5]
+ a[4] * p[4] + a[5] * p[3] + a[6] * p[2] + a[7] * p[1] + a[8]
- 533333328;
}
const unsigned pu[9] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000,
100000000};
int convert4u(const char *aa) {
const unsigned char *a = (const unsigned char*) aa;
return a[0] * pu[8] + a[1] * pu[7] + a[2] * pu[6] + a[3] * pu[5] + a[4] * pu[4]
+ a[5] * pu[3] + a[6] * pu[2] + a[7] * pu[1] + a[8] - (unsigned) 5333333328u;
}
int convert5(const char* a) {
int val = 0;
for(size_t k =0;k <9;++k) {
val = (val << 3) + (val << 1) + (a[k]-'0');
}
return val;
}
const unsigned pu2[9] = {100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1};
int convert6u(const char *a) {
return a[0]*pu2[0] + a[1]*pu2[1] + a[2]*pu2[2] + a[3] * pu2[3] + a[4] * pu2[4] + a[5] * pu2[5] + a[6] * pu2[6] + a[7] * pu2[7] + a[8] - (unsigned) 5333333328u;
}
constexpr std::uint64_t zeros(char z) {
std::uint64_t result = 0;
for (int i = 0; i < sizeof(result); ++i) {
result = result*256 + z;
}
return result;
}
int convertX(const char *a) {
constexpr std::uint64_t offset = zeros('0');
constexpr std::uint64_t o1 = 0xFF00FF00FF00FF00;
constexpr std::uint64_t o2 = 0xFFFF0000FFFF0000;
constexpr std::uint64_t o3 = 0xFFFFFFFF00000000;
std::uint64_t buffer;
std::memcpy(&buffer, a, sizeof(buffer));
const auto bytes = buffer - offset;
const auto b1 = (bytes & o1) >> 8;
const auto words = (bytes & ~o1) + 10*b1;
const auto w1 = (words & o2) >> 16;
const auto dwords = (words & ~o2) + 100*w1;
const auto d1 = (dwords & o3) >> 32;
const auto qwords = (dwords & ~o3) + 1000*d1;
const auto final = 10*static_cast<unsigned>(qwords) + (a[9] - '0');
return static_cast<int>(final);
}
//######################## ACCEPTED ANSWER
//########################
//########################
typedef struct { // for output into memory
alignas(16) unsigned hours;
unsigned minutes, seconds, nanos;
} hmsn;
void str2hmsn(hmsn *out, const char str[15]) // HHMMSSXXXXXXXXX 15 total, with 9-digit nanoseconds.
{ // 15 not including the terminating 0 (if any) which we don't read
//hmsn retval;
__m128i digs = _mm_loadu_si128((const __m128i*)str);
digs = _mm_sub_epi8( digs, _mm_set1_epi8('0') );
__m128i hms_x_words = _mm_maddubs_epi16( digs, _mm_set1_epi16( 10U + (1U<<8) )); // SSSE3 pairs of digits => 10s, 1s places.
__m128i hms_unpacked = _mm_cvtepu16_epi32(hms_x_words); // SSE4.1 hours, minutes, seconds unpack from uint16_t to uint32
//_mm_storeu_si128((__m128i*)&retval, hms_unpacked); // store first 3 struct members; last to be written separately
_mm_storeu_si128((__m128i*)out, hms_unpacked);
// or scalar extract with _mm_cvtsi128_si64 (movq) and shift / movzx
__m128i xwords = _mm_bsrli_si128(hms_x_words, 6); // would like to schedule this sooner, so oldest-uop-first starts this critical path shuffle ahead of pmovzx
// 8 bytes of data, lined up in low 2 dwords, rather than split across high 3
// could have got here with an 8-byte load that starts here, if we didn't want to get the H,M,S integers cheaply.
__m128i xdwords = _mm_madd_epi16(xwords, _mm_setr_epi16(100, 1, 100, 1, 0,0,0,0)); // low/high uint32 chunks, discard the 9th x digit.
uint64_t pair32 = _mm_cvtsi128_si64(xdwords);
uint32_t msd = 100*100 * (uint32_t)pair32; // most significant dword was at lower address (in printing order), so low half on little-endian x86. encourage compilers to use 32-bit operand-size for imul
uint32_t first8_x = msd + (uint32_t)(pair32 >> 32);
uint32_t nanos = first8_x * 10 + ((unsigned char)str[14] - '0'); // total*10 + lowest digit
out->nanos = nanos;
//retval.nanos = nanos;
//return retval;
// returning the struct by value encourages compilers in the wrong direction
// into not doing separate stores, even when inlining into a function that assigns the whole struct to a pointed-to output
}
hmsn mystruct;
int convertSIMD(const char* a)
{
str2hmsn(&mystruct, a);
return mystruct.nanos;
}
//########################
//########################
using ConvertFunc = int(const char*);
volatile int result = 0; // do something with the result of function to prevent unexpected optimization
void benchmark(ConvertFunc converter, string name, int numTest=1000) {
MyTimer timer;
const int N = 100000;
char *a = new char[9*N + 17];
int64_t runtime = 0;
for (int t=1; t<=numTest; t++) {
// change something to prevent unexpected optimization
for (int i=0; i<9*N; i++) a[i] = rand() % 10 + '0';
timer.startCounter();
for (int i=0; i<9*N; i+= 9) result = converter(a+i);
runtime += timer.getCounterNs();
}
cout << name << ": " << (runtime / (double(numTest) * N)) << "ns average\n";
delete[] a;
}
int main() {
benchmark(convert1, "slow");
benchmark(convert2, "normal");
benchmark(convert3, "fast");
benchmark(convert4u, "unsigned");
benchmark(convert5, "shifting");
benchmark(convert6u, "reverse");
benchmark(convertX, "swar64");
benchmark(convertSIMD, "manualSIMD");
return 0;
}
我想找到最快的方法将char a[9]
转换为int
。完整的问题是将形式为HHMMSSxxxxxxxxx时间戳的char a[15]
转换为纳秒,其中在x
之后约50个字节被分配并且可以安全读取(但不能写入)。在这个问题中,我们只关心最后的9位数字。
版本1是基本的,版本2、3尝试节省一些计算。我使用-O3标志编译,并将10的幂存储在数组中,因为它会被优化掉(使用Godbolt进行了检查)。
如何使这更快?是的,我知道这听起来像过早的优化,但让我们假设我需要那最后的2-3%提升。
版本3明显更快,而版本2由于代码大小较慢。但是我们能比版本3做得更好吗?
无效的基准编辑2:函数可以返回unsigned int
而不是int
(即
unsigned convert1(char *a);
编辑3:我注意到新代码是一个无效的基准测试,因为convert(a)只被执行了一次。使用原始代码,差异仅约为1%。
编辑4:新基准测试。使用 unsigned(convert4u、convert6u)比使用 int 快3-5%。我将运行一个长时间(10+分钟)的基准测试来看看是否有获胜者。我编辑了代码以使用新的基准测试。它生成大量数据,然后运行转换器函数。
编辑5:结果:4.19, 4.51, 3.82, 3.59, 7.64, 3.72
秒。无符号版本最快。能否仅对9个字节使用 SIMD?如果不行,那么我猜这是最好的解决方案。尽管如此,我仍然希望有一个更疯狂的解决方案。
编辑6:在 AMD Ryzen 4350G 上进行基准测试,gcc 版本为 10.3,编译命令为 gcc -o main main.cpp -std=c++17 -O3 -mavx -mavx2 -march=native
slow: 4.17794ns average
normal: 2.59945ns average
fast: 2.27917ns average
unsigned: 2.43814ns average
shifting: 4.72233ns average
reverse: 2.2274ns average
swar64: 2.17179ns average
manualSIMD: 1.55203ns average
被接受的答案甚至比问题要求的计算更多,可以计算HH/MM/SS/nanosec
,因此它比这个基准测试显示的速度更快。
std::from_chars()
进行比较? - einpoklumunsigned
数学,那么(a[0] - 48) * p[8] ... - 533333328
可能会变成a[0] * p[8] ... - (unsigned)5333333328u
。(希望我常量理解正确) - chux - Reinstate Monicafor (int i=1; i<= 1000000; i++) result = converter(a);
循环,并且理解该函数对于相同的输入始终返回相同的结果。请注意指令mov dword ptr [rip + result], eax
在连续重复10次后,直接跳回块的开头以实现1000000次重复执行。实际转换只执行了一次。(并且使用较旧的编译器相比,时间更快了x10倍)。 - n. m.