根据缓冲区的大小,我预期延迟会落在几个界限分明的簇(台阶)中。我必须说,我确实看到了一个清晰的模式,这算是一个成功。经典文章中图3.10、3.11所用的顺序读取程序在我这里的效果不如当年那么好,可能是因为自2007年以来硬件预取已经大大改进。
奇怪的是,我得到的模式与比我实际缓存大8倍的缓存相吻合。所以我的问题是:这是为什么?
我的系统(2014年中期的2核MacBook Pro)具有L1d缓存32k、L2缓存256k、L3缓存3M(共享),体系结构是Intel Haswell。
我看到的模式如下:
Size 1024 Loops 10240000 Cycles 7.60162
Size 2048 Loops 10240000 Cycles 7.14387
Size 4096 Loops 10240000 Cycles 7.74612
Size 8192 Loops 10240000 Cycles 6.93018
Size 16384 Loops 10240000 Cycles 7.32189
Size 32768 Loops 10240000 Cycles 7.84709
Size 65536 Loops 10240000 Cycles 8.32192 # <- L1 cache is 32k so I expect a step here
Size 131072 Loops 10240000 Cycles 7.51579
Size 262144 Loops 10240000 Cycles 9.07455
Size 524288 Loops 10240000 Cycles 16.1824 # <- L1 step is here instead / here I would expect a step associated with L2
Size 1048576 Loops 10240000 Cycles 19.0783
Size 2097152 Loops 10240000 Cycles 11.633
Size 4194304 Loops 10240000 Cycles 23.773 # <- L2 step is here instead
Size 8388608 Loops 10240000 Cycles 24.2754
Size 16777216 Loops 10240000 Cycles 61.0624 # <- L3 step is here, apparently (makes sense, since L3 is shared, that it comes a bit earlier than expected)
Size 33554432 Loops 10240000 Cycles 57.5953
Size 67108864 Loops 10240000 Cycles 44.3678
我承认可能是我根本测不出L1缓存,而把L2的台阶误当成了L1的台阶;但如果看一下32位时代的一个旧答案,会看到一条显眼的语句(注意其中的 `* 4`):
test_cache(attempts, cache_sizes[i] * 4, latencies, sizeof(latencies) / sizeof(*latencies));
出于某种原因,回答者是在一个4倍于缓存标称大小的缓冲区中测试延迟的。
我感到困惑-这样做的原因是什么?
下面是我的代码(在macOS和clang上运行,但可以轻松适应其他系统)
mwe.c
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "x86intrin.h"
#define N (134217728)
#define START_N (1024)
extern uint64_t access_random_place_to_place_memory_dep(uint64_t *, size_t, size_t);
/*
 * Benchmark driver (first version).
 *
 * Fills an N-byte buffer with random bytes from /dev/urandom, then, for every
 * power-of-two working-set size from START_N up to (but not including) N,
 * times `loops` iterations of the pointer-chasing routine with the TSC.
 * Finally prints the average number of TSC ticks per iteration for each size.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated or filled.
 */
int main (int argc, char** argv) {
    (void)argc;
    (void)argv;

    const size_t n = N;
    const unsigned long long loops = 10240000;

    // create buffer of random memory
    uint64_t *p = malloc(n);
    if (p == NULL) {
        fprintf(stderr, "cannot allocate %zu bytes\n", n);
        return 1;
    }

    int randomData = open("/dev/urandom", O_RDONLY);
    if (randomData < 0) {
        perror("open /dev/urandom");
        free(p);
        return 1;
    }
    // read() may deliver fewer bytes than requested, so keep reading until
    // the whole buffer is filled.
    size_t filled = 0;
    while (filled < n) {
        ssize_t got = read(randomData, (unsigned char *)p + filled, n - filled);
        if (got <= 0) {
            fprintf(stderr, "failed to read random data\n");
            close(randomData);
            free(p);
            return 1;
        }
        filled += (size_t)got;
    }
    close(randomData);

    // result arrays
    double results[64];
    size_t memories[64];
    const size_t capacity = sizeof results / sizeof results[0];
    // Number of measurements taken; must start at 0 (it indexes the arrays).
    size_t count = 0;

    for (size_t working_memory = START_N;
         working_memory < n && count < capacity;
         working_memory <<= 1) {
        // Only the call itself sits between the two timestamp reads.
        unsigned long long ta = _rdtsc();
        access_random_place_to_place_memory_dep(p, working_memory, loops);
        unsigned long long tb = _rdtsc();
        memories[count] = working_memory;
        results[count] = (double)(tb-ta)/loops;
        count++;
    }
    free(p);

    for (size_t j = 0; j < count; j++) {
        printf("Size %zu Loops %llu Cycles %g\n", memories[j], loops, results[j]);
    }
    // The routine's result is not needed here; report success explicitly
    // instead of returning an indeterminate value as the exit status.
    return 0;
}
mwe.s
.intel_syntax
.global _access_random_place_to_place_memory_dep
.section __DATA,__data
.section __TEXT,__text
/*
Access memory randomly, by pointer chasing,
each stride depends on the last read
*/
// (rdi, rsi, rdx) -> rax
// (rdi) pointer to buffer
// (rsi) size of buffer (power of 2); turned into an AND mask (size - 1)
// (rdx) iterations (0 returns immediately)
// returns (rax): the last "address + loaded value" computed by the chain
// Only caller-saved registers (rax, rcx, rsi, r8) are modified: rbx is
// callee-saved in the System V AMD64 ABI, so the chain lives in rax instead.
// no need to pad stack since no function call
_access_random_place_to_place_memory_dep:
    // seed the chain with the first qword of the buffer
    mov rax, [rdi]
    // rcx counts completed iterations
    xor rcx, rcx
    // will use as AND mask
    dec rsi
beginz:
    cmp rdx, rcx
    jbe endz
    inc rcx
    // reduce the previous value to a byte offset inside the working set
    and rax, rsi
    lea rax, [rdi + rax]
    // dependent load: the next address is derived from this value
    mov r8, [rax]
    add rax, r8
    jmp beginz
endz:
    // result is already in rax
    ret
编译:`clang mwe.c mwe.s -o mwe`
运行:`./mwe`
编辑(2022/3/23)
感谢纠正!我呈现当前版本及其输出。希望它能帮助别人,因为我自己没有轻易找到这样的代码。
mwe.c
#include <stdlib.h>
#include <stdio.h>
#include "x86intrin.h"
#include <fcntl.h>
#include <unistd.h>
#define N (134217728)
#define START_N (128)
extern uint64_t access_random_place_to_place_memory_dep(uint64_t *, size_t, size_t);
/*
 * Build a random pointer-chasing chain in the first max_offset bytes of p.
 *
 * After the call, each of the max_offset/8 qwords holds the byte offset
 * (a multiple of 8, smaller than max_offset) of another qword, and following
 * the offsets from offset 0 visits every qword exactly once before coming
 * back to 0.
 *
 * A plain Fisher-Yates shuffle yields an arbitrary permutation, which can
 * split into several short cycles so that the chase touches only part of the
 * working set. Sattolo's variant (the swap partner is always strictly after
 * the current slot) yields a permutation made of a single cycle.
 *
 * p          buffer of at least max_offset bytes
 * max_offset working-set size in bytes; sizes below 16 leave at most one
 *            qword, which then points to itself
 *
 * Uses rand(), so the chain is reproducible for a given srand() seed.
 * NOTE(review): swap partners are drawn with rand() % range, so the range
 * should not exceed RAND_MAX -- confirm for the target libc.
 */
void place_random_shuffle(uint64_t *p, uint64_t max_offset) {
    uint64_t max_offset_q = max_offset/8;
    // start by writing for each qword its own offset
    for (uint64_t i=0; i<max_offset_q; i++) {
        p[i] = 8*i;
    }
    // then shuffle into one cycle (Sattolo's algorithm).
    // "i + 1 < max_offset_q" avoids the unsigned underflow of
    // "max_offset_q - 1" when there are no qwords at all.
    for (uint64_t i=0; i + 1 < max_offset_q; i++) {
        uint64_t candidates = max_offset_q - i - 1;   // slots after i, >= 1
        uint64_t j = i + 1 + ((uint64_t)rand() % candidates);
        uint64_t t = p[i];
        p[i] = p[j];
        p[j] = t;
    }
}
/*
 * Benchmark driver (revised version).
 *
 * For every power-of-two working-set size from START_N up to (but not
 * including) N bytes: rebuild the random offset chain in the buffer with
 * place_random_shuffle(), then time `loops` dependent loads with the TSC.
 * Finally prints the average number of TSC ticks per load for each size.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated.
 */
int main (int argc, char** argv) {
    (void)argc;
    (void)argv;

    const size_t n = N;
    const unsigned long long loops = 10240000;

    // buffer that holds the offset chain
    uint64_t *p = malloc(n);
    if (p == NULL) {
        fprintf(stderr, "cannot allocate %zu bytes\n", n);
        return 1;
    }

    // result arrays
    double results[64];
    size_t memories[64];
    const size_t capacity = sizeof results / sizeof results[0];
    // Number of measurements taken; must start at 0 (it indexes the arrays).
    size_t count = 0;
    uint64_t res = 0;

    for (size_t working_memory = START_N;
         working_memory < n && count < capacity;
         working_memory <<= 1) {
        // Building the chain is deliberately kept outside the timed region.
        place_random_shuffle(p, working_memory);
        unsigned long long ta = _rdtsc();
        res = access_random_place_to_place_memory_dep(p, working_memory, loops);
        unsigned long long tb = _rdtsc();
        memories[count] = working_memory;
        results[count] = (double)(tb-ta)/loops;
        count++;
    }
    free(p);

    for (size_t j = 0; j < count; j++) {
        printf("Size %zu Loops %llu Cycles %g\n", memories[j], loops, results[j]);
    }
    // The checksum is not a meaningful exit status; report success instead.
    (void)res;
    return 0;
}
mwe.s
.intel_syntax
.global _access_random_place_to_place_memory_dep
.section __DATA,__data
.section __TEXT,__text
/*
Access memory by pointer chasing:
the address of each load depends on the value returned by the previous load.
*/
// (rdi, rsi, rdx) -> rax
// (rdi) pointer to buffer
// (rsi) size of buffer (not read by this routine)
// (rdx) iterations (must be >= 1: the counter is decremented before it is
//       tested, so 0 would wrap around instead of exiting)
// returns (rax): sum of all values loaded along the chain
// no need to pad stack since no function call
_access_random_place_to_place_memory_dep:
// rax = current byte offset into the buffer, starting at 0
xor rax, rax
// r8 = running sum of the loaded values
xor r8, r8
beginz:
// dependent load: the value read becomes the next byte offset
mov rax, [rdi+rax]
add r8, rax
dec rdx
jnz beginz
endz:
// return the accumulated sum
mov rax, r8
ret
现在的输出效果要好得多了!
Size 128 Loops 10240000 Cycles 4.51077
Size 256 Loops 10240000 Cycles 4.67502
Size 512 Loops 10240000 Cycles 4.46404
Size 1024 Loops 10240000 Cycles 4.47518
Size 2048 Loops 10240000 Cycles 4.67881
Size 4096 Loops 10240000 Cycles 4.5293
Size 8192 Loops 10240000 Cycles 4.36537
Size 16384 Loops 10240000 Cycles 4.56763
Size 32768 Loops 10240000 Cycles 4.59288
Size 65536 Loops 10240000 Cycles 8.66269 # <- L1 step
Size 131072 Loops 10240000 Cycles 9.48717
Size 262144 Loops 10240000 Cycles 15.2417
Size 524288 Loops 10240000 Cycles 27.2223 # <- L2 (?) step
Size 1048576 Loops 10240000 Cycles 47.3154
Size 2097152 Loops 10240000 Cycles 104.716 # <- we start to be out of L3
Size 4194304 Loops 10240000 Cycles 177.333
Size 8388608 Loops 10240000 Cycles 218.087
Size 16777216 Loops 10240000 Cycles 234.883
Size 33554432 Loops 10240000 Cycles 242.916
Size 67108864 Loops 10240000 Cycles 260.416
这里的"Cycles(周期)"实际上是"伪周期"(TSC滴答数):由于Turbo Boost(睿频)的存在,核心时钟频率与TSC频率并不相同,但我不知道如何将其考虑在内。
(……例如 `imul eax,eax` 依赖链)。如果您同时测量墙钟时间和TSC滴答数,则可以求出该频率以及已知的核心时钟周期数。(或者搜索您具体CPU型号的TSC频率。例如,Linux会进行校准,如我的i7-6700k上的内核日志:"tsc: Refined TSC clocksource calibration: 4008.057 MHz") - Peter Cordes
使用 `perf` 或 Intel VTune 之类的工具,您也可以让它统计核心周期和参考周期的硬件PMU事件(`cpu_clk_unhalted.ref_tsc` 和 `cpu_clk_unhalted.thread`)。这将直接给出该次运行的核心周期与TSC周期之比。 - Peter Cordes