编辑:供参考(如果有人偶然发现了这个问题),Igor Ostrovsky写了一篇关于缓存未命中的绝妙文章。它讨论了几个不同的问题并展示了例子数据。结束编辑
我进行了一些测试<long story goes here>
,想知道性能差异是否由内存缓存未命中导致。以下代码演示了问题,并将其简化为关键的时间部分。以下代码包含几个循环,以随机顺序和按地址升序访问内存。
我在XP机器上运行了它(使用VS2005编译:cl /O2),还在Linux盒子上运行了它(gcc –Os)。两个产生了类似的时间。这些时间以毫秒为单位。我相信所有循环都在运行而且没有被优化掉(否则它会“瞬间”运行)。
*** 测试 20000 个节点 总共排序时间:888.822899 总共随机时间:2155.846268
这些数字有意义吗?差异主要是由 L1 缓存未命中引起的,还是其他原因也在发生?有 20,000^2 的内存访问,如果每一个都是缓存未命中,那么大约是每个未命中需要 3.2 纳秒。我测试的 XP(P4)机器速度为 3.2GHz,并且我认为(但不确定)它具有 32KB 的 L1 缓存和 512KB 的 L2 缓存。对于 20,000 个条目(80KB),我认为没有显著数量的 L2 缓存未命中。因此这将是 (3.2*10^9 次/秒) * 3.2*10^-9 秒/未命中) = 10.1 次/未命中
。这对我来说似乎很高。也许不是这样,或者也许我的数学很差。我尝试使用 VTune 测量缓存未命中,但我得到了一个蓝屏。现在我无法连接到许可证服务器(grrrr)。
typedef struct stItem
{
long lData;
//char acPad[20];
} LIST_NODE;
#if defined( WIN32 )
void StartTimer( LONGLONG *pt1 )
{
QueryPerformanceCounter( (LARGE_INTEGER*)pt1 );
}
void StopTimer( LONGLONG t1, double *pdMS )
{
LONGLONG t2, llFreq;
QueryPerformanceCounter( (LARGE_INTEGER*)&t2 );
QueryPerformanceFrequency( (LARGE_INTEGER*)&llFreq );
*pdMS = ((double)( t2 - t1 ) / (double)llFreq) * 1000.0;
}
#else
// doesn't need 64-bit integer in this case
void StartTimer( LONGLONG *pt1 )
{
// Just use clock(), this test doesn't need higher resolution
*pt1 = clock();
}
void StopTimer( LONGLONG t1, double *pdMS )
{
LONGLONG t2 = clock();
*pdMS = (double)( t2 - t1 ) / ( CLOCKS_PER_SEC / 1000 );
}
#endif
long longrand()
{
#if defined( WIN32 )
// Stupid cheesy way to make sure it is not just a 16-bit rand value
return ( rand() << 16 ) | rand();
#else
return rand();
#endif
}
// get random value in the given range
int randint( int m, int n )
{
int ret = longrand() % ( n - m + 1 );
return ret + m;
}
// I think I got this out of Programming Pearls (Bentley).
void ShuffleArray
(
long *plShuffle, // (O) return array of "randomly" ordered integers
long lNumItems // (I) length of array
)
{
long i;
long j;
long t;
for ( i = 0; i < lNumItems; i++ )
plShuffle[i] = i;
for ( i = 0; i < lNumItems; i++ )
{
j = randint( i, lNumItems - 1 );
t = plShuffle[i];
plShuffle[i] = plShuffle[j];
plShuffle[j] = t;
}
}
int main( int argc, char* argv[] )
{
long *plDataValues;
LIST_NODE *pstNodes;
long lNumItems = 20000;
long i, j;
LONGLONG t1; // for timing
double dms;
if ( argc > 1 && atoi(argv[1]) > 0 )
lNumItems = atoi( argv[1] );
printf( "\n\n*** Testing %u nodes\n", lNumItems );
srand( (unsigned int)time( 0 ));
// allocate the nodes as one single chunk of memory
pstNodes = (LIST_NODE*)malloc( lNumItems * sizeof( LIST_NODE ));
assert( pstNodes != NULL );
// Create an array that gives the access order for the nodes
plDataValues = (long*)malloc( lNumItems * sizeof( long ));
assert( plDataValues != NULL );
// Access the data in order
for ( i = 0; i < lNumItems; i++ )
plDataValues[i] = i;
StartTimer( &t1 );
// Loop through and access the memory a bunch of times
for ( j = 0; j < lNumItems; j++ )
{
for ( i = 0; i < lNumItems; i++ )
{
pstNodes[plDataValues[i]].lData = i * j;
}
}
StopTimer( t1, &dms );
printf( "Total Ordered Time: %f\n", dms );
// now access the array positions in a "random" order
ShuffleArray( plDataValues, lNumItems );
StartTimer( &t1 );
for ( j = 0; j < lNumItems; j++ )
{
for ( i = 0; i < lNumItems; i++ )
{
pstNodes[plDataValues[i]].lData = i * j;
}
}
StopTimer( t1, &dms );
printf( "Total Random Time: %f\n", dms );
}