Measuring Cache Latency


I want to measure the latencies of the L1, L2 and L3 caches in C. I know their sizes and I understand conceptually how to do it, but I'm running into problems with my implementation. I'm wondering whether some other hardware intricacy, such as prefetching, is causing the problems.

#include <time.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>   // needed for srand()

int main(){
    srand(time(NULL));  // Seed ONCE
    const int L1_CACHE_SIZE =  32768/sizeof(int);
    const int L2_CACHE_SIZE =  262144/sizeof(int);
    const int L3_CACHE_SIZE =  6587392/sizeof(int);
    const int NUM_ACCESSES = 1000000;
    const int SECONDS_PER_NS = 1000000000;
    int arrayAccess[L1_CACHE_SIZE];
    int arrayInvalidateL1[L1_CACHE_SIZE];
    int arrayInvalidateL2[L2_CACHE_SIZE];
    int arrayInvalidateL3[L3_CACHE_SIZE];
    int count=0;
    int index=0;
    int i=0;
    struct timespec startAccess, endAccess;
    double mainMemAccess, L1Access, L2Access, L3Access;
    int readValue=0;

    memset(arrayAccess, 0, L1_CACHE_SIZE*sizeof(int));
    memset(arrayInvalidateL1, 0, L1_CACHE_SIZE*sizeof(int));
    memset(arrayInvalidateL2, 0, L2_CACHE_SIZE*sizeof(int));
    memset(arrayInvalidateL3, 0, L3_CACHE_SIZE*sizeof(int));

    index = 0;
    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //Access Value from L2
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
    mainMemAccess = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
    mainMemAccess /= count;

    printf("Main Memory Access %lf\n", mainMemAccess);

    index = 0;
    count=0;
    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //Access Value from L2
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock              
    L1Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
    L1Access /= count;

    printf("L1 Cache Access %lf\n", L1Access);

    //invalidate L1 by accessing all elements of array which is larger than cache
    for(count=0; count < L1_CACHE_SIZE; count++){
        int read = arrayInvalidateL1[count]; 
        read++;
        readValue+=read;               
    }

    index = 0;
    count = 0;
    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //Access Value from L2
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
    L2Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
    L2Access /= count;

    printf("L2 Cache Acces %lf\n", L2Access);

    //invalidate L2 by accessing all elements of array which is larger than cache
    for(count=0; count < L2_CACHE_SIZE; count++){
        int read = arrayInvalidateL2[count];  
        read++;
        readValue+=read;                        
    }

    index = 0;
    count=0;
    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //Access Value from L2
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
    L3Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
    L3Access /= count;

    printf("L3 Cache Access %lf\n", L3Access);

    printf("Read Value: %d", readValue);

}

I start by accessing a value in the array to get the data I want; that access should clearly come from main memory since it's the first one. The array is small (less than a page), so it should get copied into L1, L2 and L3. I then access values from another array of the same size to invalidate the data I want to access (which should now only be in L2/L3). Then I repeat the process for L2 and L3. The access times are clearly off, though, which means I'm doing something wrong...

I think there might be an issue with the clock calls (starting and stopping the timer takes some time in ns, and that time changes depending on whether the clock code itself is currently cached or not).
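
For reference, here is a minimal sketch (separate from the code above) for estimating the cost of the timer calls themselves: time a large number of back-to-back clock_gettime calls and divide by the iteration count.

#include <stdio.h>
#include <time.h>

int main(){
    const long ITER = 1000000;
    struct timespec t0, t1, scratch;

    clock_gettime(CLOCK_REALTIME, &t0);
    for(long i = 0; i < ITER; i++){
        clock_gettime(CLOCK_REALTIME, &scratch);   //the call whose cost we want to estimate
    }
    clock_gettime(CLOCK_REALTIME, &t1);

    double ns = (t1.tv_sec - t0.tv_sec) * 1000000000.0 + (t1.tv_nsec - t0.tv_nsec);
    printf("clock_gettime overhead: about %lf ns per call\n", ns / ITER);
    return 0;
}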

Can someone give me some pointers on what I might be doing wrong?

UPDATE1: So I amortized the cost of the timer by doing lots of accesses, I fixed my cache sizes, and I also took the advice to use a more complicated indexing scheme to avoid fixed strides. Unfortunately the times still aren't right; they all seem to be coming from L1. I'm thinking the problem might be with the invalidation rather than with the accesses. Would a random vs. LRU replacement scheme affect which data gets evicted?

UPDATE2: Fixed the memset (and added an L3 memset so the data in L3 is invalidated too, making the first accesses really start from main memory) and the indexing scheme; still no luck.

UPDATE3: I could never get this approach to work, but there were some good suggested answers, and I also posted a couple of solutions of my own.

I also ran Cachegrind to look at the hits/misses.

==6710== I   refs:      1,735,104
==6710== I1  misses:        1,092
==6710== LLi misses:        1,084
==6710== I1  miss rate:      0.06%
==6710== LLi miss rate:      0.06%
==6710== 
==6710== D   refs:      1,250,696  (721,162 rd   + 529,534 wr)
==6710== D1  misses:      116,492  (  7,627 rd   + 108,865 wr)
==6710== LLd misses:      115,102  (  6,414 rd   + 108,688 wr)
==6710== D1  miss rate:       9.3% (    1.0%     +    20.5%  )
==6710== LLd miss rate:       9.2% (    0.8%     +    20.5%  )
==6710== 
==6710== LL refs:         117,584  (  8,719 rd   + 108,865 wr)
==6710== LL misses:       116,186  (  7,498 rd   + 108,688 wr)
==6710== LL miss rate:        3.8% (    0.3%     +    20.5%  )


        Ir I1mr ILmr      Dr  D1mr  DLmr     Dw D1mw DLmw 

      .    .    .       .     .     .      .    .    .  #include <time.h>
      .    .    .       .     .     .      .    .    .  #include <stdio.h>
      .    .    .       .     .     .      .    .    .  #include <string.h>
      .    .    .       .     .     .      .    .    .  
      6    0    0       0     0     0      2    0    0  int main(){
      5    1    1       0     0     0      2    0    0      srand(time(NULL));  // Seed ONCE
      1    0    0       0     0     0      1    0    0      const int L1_CACHE_SIZE =  32768/sizeof(int);
      1    0    0       0     0     0      1    0    0      const int L2_CACHE_SIZE =  262144/sizeof(int);
      1    0    0       0     0     0      1    0    0      const int L3_CACHE_SIZE =  6587392/sizeof(int);
      1    0    0       0     0     0      1    0    0      const int NUM_ACCESSES = 1000000;
      1    0    0       0     0     0      1    0    0      const int SECONDS_PER_NS = 1000000000;
     21    2    2       3     0     0      3    0    0      int arrayAccess[L1_CACHE_SIZE];
     21    1    1       3     0     0      3    0    0      int arrayInvalidateL1[L1_CACHE_SIZE];
     21    2    2       3     0     0      3    0    0      int arrayInvalidateL2[L2_CACHE_SIZE];
     21    1    1       3     0     0      3    0    0      int arrayInvalidateL3[L3_CACHE_SIZE];
      1    0    0       0     0     0      1    0    0      int count=0;
      1    1    1       0     0     0      1    0    0      int index=0;
      1    0    0       0     0     0      1    0    0      int i=0;
      .    .    .       .     .     .      .    .    .      struct timespec startAccess, endAccess;
      .    .    .       .     .     .      .    .    .      double mainMemAccess, L1Access, L2Access, L3Access;
      1    0    0       0     0     0      1    0    0      int readValue=0;
      .    .    .       .     .     .      .    .    .  
      7    0    0       2     0     0      1    1    1      memset(arrayAccess, 0, L1_CACHE_SIZE*sizeof(int));
      7    1    1       2     2     0      1    0    0      memset(arrayInvalidateL1, 0, L1_CACHE_SIZE*sizeof(int));
      7    0    0       2     2     0      1    0    0      memset(arrayInvalidateL2, 0, L2_CACHE_SIZE*sizeof(int));
      7    1    1       2     2     0      1    0    0      memset(arrayInvalidateL3, 0, L3_CACHE_SIZE*sizeof(int));
      .    .    .       .     .     .      .    .    .  
      1    0    0       0     0     0      1    1    1      index = 0;
      4    0    0       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    772    1    1     514     0     0      0    0    0      while (index < L1_CACHE_SIZE) {
  1,280    1    1     768   257   257    256    0    0          int tmp = arrayAccess[index];               //Access Value from L2
  2,688    0    0     768     0     0    256    0    0          index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
    256    0    0     256     0     0      0    0    0          count++;                                           //divide overall time by this 
      .    .    .       .     .     .      .    .    .      }
      4    0    0       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
     14    1    1       5     1     1      1    1    1      mainMemAccess = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
      6    0    0       2     0     0      1    0    0      mainMemAccess /= count;
      .    .    .       .     .     .      .    .    .  
      6    1    1       2     0     0      2    0    0      printf("Main Memory Access %lf\n", mainMemAccess);
      .    .    .       .     .     .      .    .    .  
      1    0    0       0     0     0      1    0    0      index = 0;
      1    0    0       0     0     0      1    0    0      count=0;
      4    1    1       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    772    1    1     514     0     0      0    0    0      while (index < L1_CACHE_SIZE) {
  1,280    0    0     768   240     0    256    0    0          int tmp = arrayAccess[index];               //Access Value from L2
  2,688    0    0     768     0     0    256    0    0          index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
    256    0    0     256     0     0      0    0    0          count++;                                           //divide overall time by this 
      .    .    .       .     .     .      .    .    .      }
      4    0    0       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &endAccess); //end clock              
     14    1    1       5     0     0      1    1    0      L1Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
      6    1    1       2     0     0      1    0    0      L1Access /= count;
      .    .    .       .     .     .      .    .    .  
      6    0    0       2     0     0      2    0    0      printf("L1 Cache Access %lf\n", L1Access);
      .    .    .       .     .     .      .    .    .  
      .    .    .       .     .     .      .    .    .      //invalidate L1 by accessing all elements of array which is larger than cache
 32,773    1    1  24,578     0     0      1    0    0      for(count=0; count < L1_CACHE_SIZE; count++){
 40,960    0    0  24,576   513   513  8,192    0    0          int read = arrayInvalidateL1[count]; 
  8,192    0    0   8,192     0     0      0    0    0          read++;
 16,384    0    0  16,384     0     0      0    0    0          readValue+=read;               
      .    .    .       .     .     .      .    .    .      }
      .    .    .       .     .     .      .    .    .  
      1    0    0       0     0     0      1    0    0      index = 0;
      1    1    1       0     0     0      1    0    0      count = 0;
      4    0    0       0     0     0      1    1    0      clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    772    1    1     514     0     0      0    0    0      while (index < L1_CACHE_SIZE) {
  1,280    0    0     768   256     0    256    0    0          int tmp = arrayAccess[index];               //Access Value from L2
  2,688    0    0     768     0     0    256    0    0          index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
    256    0    0     256     0     0      0    0    0          count++;                                           //divide overall time by this 
      .    .    .       .     .     .      .    .    .      }
      4    1    1       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
     14    0    0       5     1     0      1    1    0      L2Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
      6    1    1       2     0     0      1    0    0      L2Access /= count;
      .    .    .       .     .     .      .    .    .  
      6    0    0       2     0     0      2    0    0      printf("L2 Cache Acces %lf\n", L2Access);
      .    .    .       .     .     .      .    .    .  
      .    .    .       .     .     .      .    .    .      //invalidate L2 by accessing all elements of array which is larger than cache
262,149    2    2 196,610     0     0      1    0    0      for(count=0; count < L2_CACHE_SIZE; count++){
327,680    0    0 196,608 4,097 4,095 65,536    0    0          int read = arrayInvalidateL2[count];  
 65,536    0    0  65,536     0     0      0    0    0          read++;
131,072    0    0 131,072     0     0      0    0    0          readValue+=read;                        
      .    .    .       .     .     .      .    .    .      }
      .    .    .       .     .     .      .    .    .  
      1    0    0       0     0     0      1    0    0      index = 0;
      1    0    0       0     0     0      1    0    0      count=0;
      4    0    0       0     0     0      1    1    0      clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    772    1    1     514     0     0      0    0    0      while (index < L1_CACHE_SIZE) {
  1,280    0    0     768   256     0    256    0    0          int tmp = arrayAccess[index];               //Access Value from L2
  2,688    0    0     768     0     0    256    0    0          index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
    256    0    0     256     0     0      0    0    0          count++;                                           //divide overall time by this 
      .    .    .       .     .     .      .    .    .      }
      4    0    0       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
     14    1    1       5     1     0      1    1    0      L3Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
      6    0    0       2     0     0      1    0    0      L3Access /= count;
      .    .    .       .     .     .      .    .    .  
      6    1    1       2     0     0      2    0    0      printf("L3 Cache Access %lf\n", L3Access);
      .    .    .       .     .     .      .    .    .  
      6    0    0       1     0     0      1    0    0      printf("Read Value: %d", readValue);
      .    .    .       .     .     .      .    .    .  
      3    0    0       3     0     0      0    0    0  }

Use rdtsc instead of clock_gettime, see: Is clock_gettime() adequate for sub-microsecond timing? - HMVC
In the grand scheme of things that shouldn't matter much, since I spread the overhead over a wide range of accesses. - PandaRaid
The L1 question can be answered from the Intel developer manuals. I'm pretty sure it says in there that L1 accesses perform exactly like register accesses. What the hardware prefetcher gets right, and what it hopelessly screws up, never ceases to amaze me. - camelccc
What processor architecture are you using? - nmichaels
PandaRaid, Cachegrind is not the real cache, it is only a cache simulator, and its caches don't exactly match the CPU's real caches and their associativity/eviction schemes. Use perf stat to get the total real hit/miss counts and perf record to get some information about which instructions miss. - osgx
5 Answers


I would rather try to use the hardware clock as a measure. The rdtsc instruction will tell you the current cycle count since the CPU was powered up. Also, it is better to use asm to make sure the same instructions are always used in both the measured and the dry runs. Using that and some clever statistics, I did this quite a while ago:

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h>


int i386_cpuid_caches (size_t * data_caches) {
    int i;
    int num_data_caches = 0;
    for (i = 0; i < 32; i++) {

        // Variables to hold the contents of the 4 i386 legacy registers
        uint32_t eax, ebx, ecx, edx; 

        eax = 4; // get cache info
        ecx = i; // cache id

        asm (
            "cpuid" // call i386 cpuid instruction
            : "+a" (eax) // contains the cpuid command code, 4 for cache query
            , "=b" (ebx)
            , "+c" (ecx) // contains the cache id
            , "=d" (edx)
        ); // generates output in 4 registers eax, ebx, ecx and edx 

        // taken from http://download.intel.com/products/processor/manual/325462.pdf Vol. 2A 3-149
        int cache_type = eax & 0x1F; 

        if (cache_type == 0) // end of valid cache identifiers
            break;

        char * cache_type_string;
        switch (cache_type) {
            case 1: cache_type_string = "Data Cache"; break;
            case 2: cache_type_string = "Instruction Cache"; break;
            case 3: cache_type_string = "Unified Cache"; break;
            default: cache_type_string = "Unknown Type Cache"; break;
        }

        int cache_level = (eax >>= 5) & 0x7;

        int cache_is_self_initializing = (eax >>= 3) & 0x1; // does not need SW initialization
        int cache_is_fully_associative = (eax >>= 1) & 0x1;


        // taken from http://download.intel.com/products/processor/manual/325462.pdf 3-166 Vol. 2A
        // ebx contains 3 integers of 10, 10 and 12 bits respectively
        unsigned int cache_sets = ecx + 1;
        unsigned int cache_coherency_line_size = (ebx & 0xFFF) + 1;
        unsigned int cache_physical_line_partitions = ((ebx >>= 12) & 0x3FF) + 1;
        unsigned int cache_ways_of_associativity = ((ebx >>= 10) & 0x3FF) + 1;

        // Total cache size is the product
        size_t cache_total_size = cache_ways_of_associativity * cache_physical_line_partitions * cache_coherency_line_size * cache_sets;

        if (cache_type == 1 || cache_type == 3) {
            data_caches[num_data_caches++] = cache_total_size;
        }

        printf(
            "Cache ID %d:\n"
            "- Level: %d\n"
            "- Type: %s\n"
            "- Sets: %d\n"
            "- System Coherency Line Size: %d bytes\n"
            "- Physical Line partitions: %d\n"
            "- Ways of associativity: %d\n"
            "- Total Size: %zu bytes (%zu kb)\n"
            "- Is fully associative: %s\n"
            "- Is Self Initializing: %s\n"
            "\n"
            , i
            , cache_level
            , cache_type_string
            , cache_sets
            , cache_coherency_line_size
            , cache_physical_line_partitions
            , cache_ways_of_associativity
            , cache_total_size, cache_total_size >> 10
            , cache_is_fully_associative ? "true" : "false"
            , cache_is_self_initializing ? "true" : "false"
        );
    }

    return num_data_caches;
}

int test_cache(size_t attempts, size_t lower_cache_size, int * latencies, size_t max_latency) {
    int fd = open("/dev/urandom", O_RDONLY);
    if (fd < 0) {
        perror("open");
        abort();
    }
    char * random_data = mmap(
          NULL
        , lower_cache_size
        , PROT_READ | PROT_WRITE
        , MAP_PRIVATE | MAP_ANON // | MAP_POPULATE
        , -1
        , 0
        ); // get some random data
    if (random_data == MAP_FAILED) {
        perror("mmap");
        abort();
    }

    size_t i;
    for (i = 0; i < lower_cache_size; i += sysconf(_SC_PAGESIZE)) {
        random_data[i] = 1;
    }


    int64_t random_offset = 0;
    while (attempts--) {
        // use processor clock timer for exact measurement
        random_offset += rand();
        random_offset %= lower_cache_size;
        int32_t cycles_used, edx, temp1, temp2;
        asm (
            "mfence\n\t"        // memory fence
            "rdtsc\n\t"         // get cpu cycle count
            "mov %%edx, %2\n\t"
            "mov %%eax, %3\n\t"
            "mfence\n\t"        // memory fence
            "mov %4, %%al\n\t"  // load data
            "mfence\n\t"
            "rdtsc\n\t"
            "sub %2, %%edx\n\t" // substract cycle count
            "sbb %3, %%eax"     // substract cycle count
            : "=a" (cycles_used)
            , "=d" (edx)
            , "=r" (temp1)
            , "=r" (temp2)
            : "m" (random_data[random_offset])
            );
        // printf("%d\n", cycles_used);
        if (cycles_used < max_latency)
            latencies[cycles_used]++;
        else 
            latencies[max_latency - 1]++;
    }

    munmap(random_data, lower_cache_size);

    return 0;
} 

int main() {
    size_t cache_sizes[32];
    int num_data_caches = i386_cpuid_caches(cache_sizes);

    int latencies[0x400];
    memset(latencies, 0, sizeof(latencies));

    int empty_cycles = 0;

    int i;
    int attempts = 1000000;
    for (i = 0; i < attempts; i++) { // measure how much overhead we have for counting cycles
        int32_t cycles_used, edx, temp1, temp2;
        asm (
            "mfence\n\t"        // memory fence
            "rdtsc\n\t"         // get cpu cycle count
            "mov %%edx, %2\n\t"
            "mov %%eax, %3\n\t"
            "mfence\n\t"        // memory fence
            "mfence\n\t"
            "rdtsc\n\t"
            "sub %2, %%edx\n\t" // substract cycle count
            "sbb %3, %%eax"     // substract cycle count
            : "=a" (cycles_used)
            , "=d" (edx)
            , "=r" (temp1)
            , "=r" (temp2)
            :
            );
        if (cycles_used < sizeof(latencies) / sizeof(*latencies))
            latencies[cycles_used]++;
        else 
            latencies[sizeof(latencies) / sizeof(*latencies) - 1]++;

    }

    {
        int j;
        size_t sum = 0;
        for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
            sum += latencies[j];
        }
        size_t sum2 = 0;
        for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
            sum2 += latencies[j];
            if (sum2 >= sum * .75) {
                empty_cycles = j;
                fprintf(stderr, "Empty counting takes %d cycles\n", empty_cycles);
                break;
            }
        }
    }

    for (i = 0; i < num_data_caches; i++) {
        test_cache(attempts, cache_sizes[i] * 4, latencies, sizeof(latencies) / sizeof(*latencies));

        int j;
        size_t sum = 0;
        for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
            sum += latencies[j];
        }
        size_t sum2 = 0;
        for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
            sum2 += latencies[j];
            if (sum2 >= sum * .75) {
                fprintf(stderr, "Cache ID %i has latency %d cycles\n", i, j - empty_cycles);
                break;
            }
        }

    }

    return 0;

}

Output on my Core2Duo:

Cache ID 0:
- Level: 1
- Type: Data Cache
- Total Size: 32768 bytes (32 kb)

Cache ID 1:
- Level: 1
- Type: Instruction Cache
- Total Size: 32768 bytes (32 kb)

Cache ID 2:
- Level: 2
- Type: Unified Cache
- Total Size: 262144 bytes (256 kb)

Cache ID 3:
- Level: 3
- Type: Unified Cache
- Total Size: 3145728 bytes (3072 kb)

Empty counting takes 90 cycles
Cache ID 0 has latency 6 cycles
Cache ID 2 has latency 21 cycles
Cache ID 3 has latency 168 cycles

Could you write down how you compiled this? I'm getting "error: 'asm' operand has impossible constraints". - Leeor
On Core2 the L1 latency should be 3 cycles and L2 should be 15 cycles; for Nehalem - 4 cycles for L1, 11 for L2 and 39 for L3 - http://www.anandtech.com/show/2542/5 - according to the CPU-Z test - there is a Windows binary of the tool at http://www.cpuid.com/medias/files/softwares/misc/latency.zip. And for AMD a typical L2 latency is 12-20 cycles - http://www.anandtech.com/show/2139/3, and a similar test, lat_mem_rd, is included in lmbench https://dev59.com/KHjZa4cB1Zd3GeqPkvwF. - osgx
@Leeor, sorry for the late reply, I was on vacation. Which compiler and target system are you using? I can compile this code for a generic x86_64 target with clang 5.0, gcc 4.8 and icc 14.0.1 without errors. Try updating your compiler. - Sergey L.
@Leeor Which line does it point to? Did you use any other compiler flags? - Sergey L.
For me it segfaulted. I found I needed to replace "=a", "=d", "=r" with "=&a", "=&d", "=&r" in the asm blocks to get it to work correctly. The ampersands tell gcc not to assume it can reuse an output register as an input; the output registers may be modified before all inputs have been read. - q.t.f.


One of the classic ways to test cache latency is to traverse a linked list. It works on modern superscalar/superpipelined CPUs, and even on out-of-order cores like the ARM Cortex-A9+ and Intel Core 2/ix. This method is used by the open-source lmbench, in its lat_mem_rd test (man page), and by the CPU-Z latency measurement tool: http://cpuid.com/medias/files/softwares/misc/latency.zip (native Windows binary).

The source of the lat_mem_rd test is available in the lmbench repository: https://github.com/foss-for-synopsys-dwc-arc-processors/lmbench/blob/master/src/lat_mem_rd.c

And this is the main part of the test:

#define ONE p = (char **)*p;
#define FIVE    ONE ONE ONE ONE ONE
#define TEN FIVE FIVE
#define FIFTY   TEN TEN TEN TEN TEN
#define HUNDRED FIFTY FIFTY

void
benchmark_loads(iter_t iterations, void *cookie)
{
    struct mem_state* state = (struct mem_state*)cookie;
    register char **p = (char**)state->p[0];
    register size_t i;
    register size_t count = state->len / (state->line * 100) + 1;

    while (iterations-- > 0) {
        for (i = 0; i < count; ++i) {
            HUNDRED;
        }
    }

    use_pointer((void *)p);
    state->p[0] = (char*)p;
}

So, after deciphering the macros we do a lot of linear operations like:

 p = (char**) *p;  // (in intel syntax) == mov eax, [eax]
 p = (char**) *p;
 p = (char**) *p;
 ....   // 100 times total
 p = (char**) *p;

over memory that contains pointers, each pointing stride elements forward.
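
To make the layout concrete, here is a minimal sketch (not lmbench's actual code; the 1 MB buffer and the 256-byte stride are made-up values) of building such a ring and then walking it with p = (char **)*p, so that every load depends on the previous one:

#include <stdio.h>
#include <stdlib.h>

//build a ring: each node stores the address of the node one stride ahead,
//and the last node points back to the first
static char ** build_pointer_ring(char * buf, size_t size, size_t stride){
    size_t last = size - stride;                       //offset of the last node
    size_t off;
    for(off = 0; off < last; off += stride)
        *(char **)(buf + off) = buf + off + stride;    //point one stride forward
    *(char **)(buf + last) = buf;                      //close the ring
    return (char **)buf;
}

int main(){
    size_t size = 1024*1024, stride = 256;             //assumed sizes
    char * buf = malloc(size);
    long i;
    if (!buf) return 1;

    char ** p = build_pointer_ring(buf, size, stride);
    for(i = 0; i < 1000000; i++)
        p = (char **)*p;                               //serialized loads: measures latency, not throughput

    printf("%p\n", (void *)p);                         //use p so the loop is not optimized away
    free(buf);
    return 0;
}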

As described at http://www.bitmover.com/lmbench/lat_mem_rd.8.html:

The benchmark runs as two nested loops. The outer loop is the stride size. The inner loop is the array size. For each array size, the benchmark creates a ring of pointers that point forward one stride. Traversing the array is done by

 p = (char **)*p;
in a for loop (the overhead of the for loop is not significant; the loop is an unrolled loop 1000 loads long). The loop stops after doing a million loads. The array size varies from 512 bytes to (typically) eight megabytes. For the small sizes, the cache will have an effect, and the loads will be much faster. This becomes much more apparent when the data is plotted.
A more detailed description, with examples on POWER, is available on IBM's wiki: Untangling memory access measurements - lat_mem_rd - by Jenifer Hopper, 2013. The lat_mem_rd test (http://www.bitmover.com/lmbench/lat_mem_rd.8.html) takes two arguments: an array size in MB and a stride size. The benchmark uses two loops to traverse the array, using the stride as the increment, by creating a ring of pointers that point forward one stride. The test measures memory read latency in nanoseconds for a range of memory sizes. The output consists of two columns: the first is the array size in MB (a floating point value) and the second is the load latency over all the points of the array. When the results are graphed, you can clearly see the relative latencies of the entire memory hierarchy, including the faster latency of each cache level and the main memory latency.
PS: Intel has a paper (thanks to Eldar Abusalimov) with examples of running lat_mem_rd: ftp://download.intel.com/design/intarch/PAPERS/321074.pdf - sorry, the correct URL is http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-cache-latency-bandwidth-paper.pdf, "Measuring Cache and Memory Latency and CPU to Memory Bandwidth - For use with Intel Architecture" by Joshua Ruggiero, December 2008.

A newer PDF link is https://www.element14.com/community/servlet/JiveServlet/previewBody/20602-102-1-62048/White%20Paper%20How%20to%20Properly%20Measure%20Cache%20Latency,%20Memory%20Latency,%20and%20CPU%20to%20Memory%20Bandwidth%20on%20Intel%C2%AE%20Architecture.pdf - "Measuring Cache and Memory Latency and CPU to Memory Bandwidth" - "For use with Intel® Architecture" - 2008. - osgx
Another PDF link is http://www.csit-sun.pub.ro/~cpop/Documentatie_SMP/Intel_Microprocessor_Systems/Intel_ProcessorNew/Intel%20White%20Paper/Measuring%20Cache%20and%20Memory%20Latency%20and%20CPU%20to%20Memory%20Bandwidth.pdf. - blaze9
Hi, I want to know the time needed to store data to main memory (when all caches miss). Do you think it is equal to the time needed to load from main memory? The latter is reported by the lat_mem_rd program, so I already know it. - blaze9
blaze9, yes, the time to store to memory should be close to (but not always equal to) the time to read from memory. It can be a bit longer because of the write policies used (https://people.cs.pitt.edu/~xianeizhang/notes/cache.html#cache-write, https://en.wikipedia.org/wiki/Cache_(computing)#WRITEPOLICIES); full cache-line writes are independent and can be sped up by being done in parallel. For RAM there are dozens of CPU clocks and 50-100 ns of latency because of how DRAM works - https://www.7-cpu.com/cpu/Haswell.html or https://www.7-cpu.com/cpu/Skylake.html. You can ask a more detailed question. - osgx


OK, there are a few problems with your code:

  1. As you mentioned, your measurements are taking a long time. In fact, they're very likely to take way longer than the single access itself, so they're not measuring anything useful. To mitigate that, access multiple elements and amortize (divide the overall time by the number of accesses). Note that to measure latency, you want these accesses to be serialized, otherwise they can be performed in parallel and you'll only measure the throughput of unrelated accesses. To achieve that you could just add a false dependency between accesses.

    For example, initialize the array to zeros, and do:

    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    for (int i = 0; i < NUM_ACCESSES; ++i) {
        int tmp = arrayAccess[index];                             //Access Value from Main Memory
        index = (index + i + tmp) & 1023;   
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
    

    .. and of course remember to divide the time by NUM_ACCESSES.
    Now, I've made the index intentionally complicated so that you avoid a fixed stride which might trigger a prefetcher (a bit of an overkill, you're not likely to notice an impact, but for the sake of demonstration...). You could probably settle for a simple index += 32, which would give you strides of 128 bytes (two cache lines), and avoid the "benefit" of most simple adjacent-line / simple stream prefetchers. I've also replaced the % 1000 with & 1023 since & is faster, but it needs to be a power of 2 to work the same way - so just increase ACCESS_SIZE to 1024 and it should work.

  2. Invalidating the L1 by loading something else is good, but the sizes look funny. You didn't specify your system, but 256000 seems pretty big for an L1. An L2 is usually 256k on many common modern x86 CPUs, for example. Also note that 256k is not 256000, but rather 256*1024=262144. Same goes for the second size: 1M is not 1024000, it's 1024*1024=1048576. Assuming that's indeed your L2 size (more likely an L3, but probably too small for that).

  3. Your invalidating arrays are of type int, so each element is longer than a single byte (most likely 4 bytes, depending on the system). You're actually invalidating L1_CACHE_SIZE*sizeof(int) worth of bytes (and the same goes for the L2 invalidation loop).

Update:

  1. memset receives the size in bytes, but your sizes are divided by sizeof(int).

  2. Your invalidation reads are never used, and may be optimized out. Try to accumulate the reads in some value and print it in the end, to avoid this possibility.

  3. The memset at the beginning is accessing the data as well, therefore your first loop is accessing data from the L3 (since the other 2 memsets were still effective in evicting it from L1+L2, although only partially due to the size error).

  4. The strides may be too small, so you can get two accesses to the same cache line (an L1 hit). Make sure they're spread out enough by adding 32 elements (x4 bytes) - that's 2 cache lines, so you also won't get any adjacent-cache-line prefetch benefits.

  5. Since NUM_ACCESSES is larger than ACCESS_SIZE, you're essentially repeating the same elements and would probably get L1 hits for them (so the avg time shifts in favor of L1 access latency). Instead try using the L1 size so you access the entire L1 (except for the skips) exactly once. For example, like this:

    index = 0;
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //Access Value from L2
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    

And don't forget to grow arrayAccess to the L1 size.

Now, with the above changes (more or less), I get these results:

L1 Cache Access 7.812500
L2 Cache Acces 15.625000
L3 Cache Access 23.437500

These still look a bit long, probably because they also include the extra dependency on an arithmetic operation.


@PandaRaid, which system is that? - Leeor
An Extreme i7. I may be wrong since I didn't read the actual specs on Intel's website, but those are the numbers I got from dmidecode -t cache. - PandaRaid
Strange, I don't think i7s come in flavors so different that the L1/L2 would differ from the mainstream; I'd only expect the L3 to be tuned for high/low-end variants. I assume you're on Linux - what does /proc/cpuinfo show? - Leeor
The cache size in cpuinfo only seems to report the L3 size, which matches the dmidecode output. I agree the L1/L2 look pretty big (especially the L1, since that would be 512k split between the data and instruction caches). - PandaRaid
Hmm. What does /sys/devices/system/cpu/cpu0/cache/index1/size show? - Leeor


Not really an answer, but read it anyway; some of this has already been mentioned in the other answers and comments.

Just the other day I answered this question:

It is about measuring L1/L2/.../L?/MEMORY transfer rates; take a look at it for a better starting point for your problem.

[Notes]

  1. I strongly recommend using the RDTSC instruction for time measurement

    especially for L1, since anything else is too slow. Don't forget to set the process affinity to a single CPU, because all cores have their own counter and these differ a lot even on the same input clock.

    Set the CPU clock to a fixed value on machines with a variable clock, and don't forget to account for RDTSC overflow if you use only its 32-bit part (modern CPUs overflow a 32-bit counter in about a second). For the time computation use the CPU clock (either measure it or use the registry value); a minimal C sketch of notes 1 and 2 is given after this list.

    t0 <- RDTSC
    Sleep(250);
    t1 <- RDTSC
    CPU f=(t1-t0)<<2 [Hz]
    
  2. Set the process affinity to a single CPU

    CPU cores usually each have their own L1 and L2 caches, so on a multi-tasking OS you can measure confusing things if you don't do this.

  3. Produce graphical output (plots)

    Then you will see what actually happens; in the link I posted above there are plenty of plots.

  4. Use the highest process priority your OS offers
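
As a concrete illustration of notes 1 and 2, here is a minimal Linux/x86 sketch (an assumed example, not code from this answer: it relies on GCC/Clang's <x86intrin.h> for __rdtsc and on glibc's sched_setaffinity) that pins the process to one CPU and calibrates the TSC against a 250 ms sleep, mirroring the pseudo-code above:

#define _GNU_SOURCE
#include <sched.h>        // sched_setaffinity, CPU_ZERO, CPU_SET
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>       // usleep
#include <x86intrin.h>    // __rdtsc (GCC/Clang)

int main(void) {
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(0, &set);                            // pin everything to CPU 0
    if (sched_setaffinity(0, sizeof(set), &set) != 0)
        perror("sched_setaffinity");

    uint64_t t0 = __rdtsc();
    usleep(250 * 1000);                          // sleep 250 ms, as in the pseudo-code
    uint64_t t1 = __rdtsc();

    double ticks_per_second = (double)(t1 - t0) * 4.0;   // 250 ms * 4 = one second's worth of ticks
    printf("TSC rate: about %.0f MHz\n", ticks_per_second / 1e6);
    return 0;
}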


Are you sure the tick counters differ between cores? These days, with CPUs that change frequency dynamically, the TSC is no longer the CPU clock (see https://dev59.com/cGIj5IYBdhLWcg3wuXeN#19942784) but a uniform, coherent time clock that counts ticks of some high-frequency, stable signal close to the typical CPU frequency. If we treat RDTSC as the actual CPU clock when that clock is variable, we get incorrect cache latency results. - osgx
I last saw this on an AMD Phenom X3, which has a stable frequency. My conclusion was that it is caused by different temperatures (if each core has its own PLL), or by the cores not being brought up at the same time. I haven't tested it on newer CPUs (I always measure with the timing thread's affinity set to 1). - Spektre


For anyone interested, I was never able to get my first set of code to work, so I tried a couple of alternative approaches that produced decent results.

The first uses a linked list whose nodes are allocated stride bytes apart in a contiguous memory space. Dereferencing the nodes mitigates the effectiveness of the prefetcher, and in case multiple cache lines get pulled in, I made the stride significantly large to avoid cache hits. As the size of the allocated list increases and it spills into the cache or memory structure that will hold it, clear divisions in latency show up.

#include <time.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>

//MACROS
#define ONE iterate = (char**) *iterate;
#define FIVE ONE ONE ONE ONE ONE
#define TWOFIVE FIVE FIVE FIVE FIVE FIVE
#define HUNDO TWOFIVE TWOFIVE TWOFIVE TWOFIVE

//prototype
void allocateRandomArray(long double);
void accessArray(char *, long double, char**);

int main(){
    //call the function for allocating arrays of increasing size in MB
    allocateRandomArray(.00049);
    allocateRandomArray(.00098);
    allocateRandomArray(.00195);
    allocateRandomArray(.00293);
    allocateRandomArray(.00391);
    allocateRandomArray(.00586);
    allocateRandomArray(.00781);
    allocateRandomArray(.01172);
    allocateRandomArray(.01562);
    allocateRandomArray(.02344);
    allocateRandomArray(.03125);
    allocateRandomArray(.04688);
    allocateRandomArray(.0625);
    allocateRandomArray(.09375);
    allocateRandomArray(.125);
    allocateRandomArray(.1875);
    allocateRandomArray(.25);
    allocateRandomArray(.375);
    allocateRandomArray(.5);
    allocateRandomArray(.75);
    allocateRandomArray(1);
    allocateRandomArray(1.5);
    allocateRandomArray(2);
    allocateRandomArray(3);
    allocateRandomArray(4);
    allocateRandomArray(6);
    allocateRandomArray(8);
    allocateRandomArray(12);
    allocateRandomArray(16);
    allocateRandomArray(24);
    allocateRandomArray(32);
    allocateRandomArray(48);
    allocateRandomArray(64);
    allocateRandomArray(96);
    allocateRandomArray(128);
    allocateRandomArray(192);
}

void allocateRandomArray(long double size){
    int accessSize=(1024*1024*size); //array size in bytes
    char * randomArray = malloc(accessSize*sizeof(char));    //allocate array of accessSize bytes
    int counter;
    int strideSize=4096;        //step size

    char ** head = (char **) randomArray;   //start of linked list in contiguous memory
    char ** iterate = head;         //iterator for linked list
    for(counter=0; counter < accessSize-strideSize; counter+=strideSize){
        (*iterate) = &randomArray[counter+strideSize];      //each node points stride bytes forward
        iterate+=(strideSize/sizeof(iterate));              //advance the iterator stride bytes
    }
    *iterate = (char *) head;       //set tail to point to head, staying inside the buffer

    accessArray(randomArray, size, head);
    free(randomArray);
}

void accessArray(char *cacheArray, long double size, char** head){
    const long double NUM_ACCESSES = 1000000000/100;    //number of accesses to linked list
    const int SECONDS_PER_NS = 1000000000;      //const for timer
    FILE *fp =  fopen("accessData.txt", "a");   //open file for writing data
    int newIndex=0;
    int counter=0;
    int read=0;
    struct timespec startAccess, endAccess;     //struct for timer
    long double accessTime = 0;
    char ** iterate = head;     //create iterator

    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    for(counter=0; counter < NUM_ACCESSES; counter++){
        HUNDO       //macro substitutes 100 accesses to mitigate loop overhead
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
    //calculate the time elapsed in ns per access
    accessTime = (((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec)) / (100*NUM_ACCESSES);
    fprintf(fp, "%Lf\t%Lf\n", accessTime, size);  //print results to file
    fclose(fp);  //close file
}

This method produced the most consistent results; using a wide range of array sizes and plotting the corresponding latencies gives a very clear distinction between the different cache sizes.

The next method, like the previous one, allocates arrays of increasing size. But instead of using a linked list for the memory accesses, I fill each index with its own number and then randomly shuffle the array. I then use these indices to jump around randomly within the array for the accesses, which mitigates the effects of the prefetcher. It does, however, occasionally show a strong skew in access times when multiple adjacent cache lines get pulled in and happen to be hit.

#include <time.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>

//prototype
void allocateRandomArray(long double);
void accessArray(int *, long int);

int main(){
    srand(time(NULL));  // Seed random function
    int i=0;
    for(i=2; i < 32; i++){
        allocateRandomArray(pow(2, i));         //call latency function on arrays of increasing size
    }


}

void allocateRandomArray(long double size){
    int accessSize = (size) / sizeof(int);
    int * randomArray = malloc(accessSize*sizeof(int));
    int counter;

    for(counter=0; counter < accessSize; counter ++){
        randomArray[counter] = counter; 
    }
    for(counter=0; counter < accessSize; counter ++){
        int i,j;
        int swap;
        i = rand() % accessSize;
        j = rand() % accessSize;
        swap = randomArray[i];
        randomArray[i] = randomArray[j];
        randomArray[j] = swap;
    } 

    accessArray(randomArray, accessSize);
    free(randomArray);
}

void accessArray(int *cacheArray, long int size){
    const long double NUM_ACCESSES = 1000000000;
    const int SECONDS_PER_NS = 1000000000;
    int newIndex=0;
    int counter=0;
    int read=0;
    struct timespec startAccess, endAccess;
    long double accessTime = 0;

    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    for(counter = 0; counter < NUM_ACCESSES; counter++){
        newIndex=cacheArray[newIndex];
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
    //calculate the time elapsed in ns per access
    accessTime = (((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec)) / (NUM_ACCESSES);
    printf("Access time: %Lf for size %ld\n", accessTime, size);
} 

Averaged over many runs, this method also produces fairly accurate results. The first option is definitely better, but this is a perfectly valid alternative approach.

The allocateRandomArray function has a bug. It picks i and j at random, but it may never swap the starting index 0, which causes the accessArray function to keep sampling index 0. The fix is to set i to counter. - Pyrolistical
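
For illustration, a hypothetical sketch of the fix the comment describes (not code from the answer above): let the first index walk the whole array, so index 0 is guaranteed to take part in at least one swap, and only pick the swap partner at random.

#include <stdlib.h>

//shuffle so that every index, including 0, is swapped at least once
static void shuffleIndices(int *array, int n){
    int counter;
    for(counter=0; counter < n; counter++){
        int j = rand() % n;             //random swap partner
        int swap = array[counter];
        array[counter] = array[j];
        array[j] = swap;
    }
}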
