代码行
next += val;
加上这行代码后,性能下降了大约10倍(从 0.19s 变为 1.24s)。我已经检查过生成的汇编代码,但没有找到原因。
为什么这一行代码会导致性能下降10倍?
以下是结果:
➜ ~ clang-13 1.c -O3
➜ ~ ./a.out
rand_read_1
sum = 2624b18779c40, time = 0.19s
rand_read_2
sum = 2624b18779c40, time = 1.24s
中央处理器(CPU):Intel(R) Xeon(R) Silver 4210 CPU @ 2.20GHz
以下是代码:
#include <stdio.h>
#include <time.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <assert.h>
#include <stdlib.h>
#define CCR_MULTIPLY_64 6364136223846793005
#define CCR_ADD_64 1
/*
 * Advance the 64-bit linear congruential generator whose state is *r
 * and return the new state. The state is updated in place.
 */
static inline uint64_t my_rand64(uint64_t *r)
{
    uint64_t next_state = *r * CCR_MULTIPLY_64 + CCR_ADD_64;

    *r = next_state;
    return next_state;
}
#define NUM 10000000UL
/*
 * Perform NUM pseudo-random reads from ptr and accumulate val ^ index.
 * Note: the modulo clamps the generator state itself, so the reduced
 * index doubles as the seed for the next iteration. The next address
 * never depends on the value loaded, so consecutive cache misses can
 * overlap (memory-level parallelism).
 */
uint64_t rand_read_1(uint64_t *ptr, uint64_t nr_words)
{
    uint64_t idx = 0;
    uint64_t acc = 0;

    for (uint64_t i = 0; i < NUM; i++) {
        my_rand64(&idx);
        idx %= nr_words;
        acc += ptr[idx] ^ idx;
    }
    return acc;
}
/*
 * Same access pattern as rand_read_1, except that the loaded value is
 * fed back into the index (next += val). The address of each read now
 * depends on the result of the previous read, forming a serial
 * load-to-load dependency chain: the CPU cannot issue the next cache
 * miss until the current one completes, so memory-level parallelism is
 * lost. That serialization — not the extra add instruction — is the
 * likely cause of the ~10x slowdown observed by the author.
 *
 * Fix vs. original: removed the unused local `next2`.
 */
uint64_t rand_read_2(uint64_t *ptr, uint64_t nr_words)
{
    uint64_t i, next, val;
    uint64_t sum;

    next = 0;
    sum = 0;
    for (i = 0; i < NUM; i++) {
        my_rand64(&next);
        next %= nr_words;
        val = ptr[next];
        sum += val ^ next;
        next += val;    /* value feeds the next address: dependency chain */
    }
    return sum;
}
#define SIZE (1024*1024*1024)
/*
 * Return the current CLOCK_REALTIME value as nanoseconds since the
 * epoch. Exits the process with an error message if clock_gettime
 * fails.
 */
static uint64_t get_ns(void)
{
    struct timespec ts;

    if (clock_gettime(CLOCK_REALTIME, &ts) != 0) {
        perror("clock_gettime");
        exit(1);
    }
    return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}
/*
 * Benchmark driver: allocate a 1 GiB buffer of zeroed uint64_t words,
 * then time rand_read_1 and rand_read_2 over it, printing each sum and
 * elapsed seconds.
 *
 * The memset after malloc faults in every page before timing starts, so
 * the OS's lazy heap allocation does not skew the measurements.
 *
 * Fixes vs. original: removed the unnecessary cast of malloc's return
 * value (C, not C++), and freed the buffer before exit.
 */
int main(int argc, char *argv[])
{
    uint64_t *ptr;
    uint64_t sum;
    uint64_t t0, t1, t2, td;

    (void)argc;
    (void)argv;

    ptr = malloc(SIZE);     /* no cast needed in C */
    assert(ptr);
    memset(ptr, 0, SIZE);   /* touch every page before the timed runs */

    t0 = get_ns();
    printf("rand_read_1\n");
    sum = rand_read_1(ptr, SIZE/8);
    t1 = get_ns();
    td = t1 - t0;
    /* NOTE(review): %lx for uint64_t assumes LP64; PRIx64 is portable */
    printf("sum = %lx, time = %.2fs\n", sum, td/1E9);

    printf("rand_read_2\n");
    sum = rand_read_2(ptr, SIZE/8);
    t2 = get_ns();
    td = t2 - t1;
    printf("sum = %lx, time = %.2fs\n", sum, td/1E9);

    free(ptr);
    return 0;
}
评论:第一次调用 rand_read 时会发生什么?— Lundin
评论:malloc(SIZE); ... memset(ptr, 0, SIZE); 可以直接换成 calloc。或者更好的做法是:先通过 volatile 访问把整块内存初始化为一些随机值,然后再将其清零——这样可以确保操作系统的延迟堆分配本身不会影响基准测试结果。— Lundin